mirror of
https://github.com/python/cpython.git
synced 2025-09-26 18:29:57 +00:00
Issue #3672: Reject surrogates in utf-8 codec; add surrogates error
handler.
This commit is contained in:
parent
02953d244f
commit
db12d454e6
9 changed files with 202 additions and 21 deletions
|
@ -323,6 +323,18 @@ and implemented by all standard Python codecs:
|
||||||
| | (only for encoding). |
|
| | (only for encoding). |
|
||||||
+-------------------------+-----------------------------------------------+
|
+-------------------------+-----------------------------------------------+
|
||||||
|
|
||||||
|
In addition, the following error handlers are specific to a single codec:
|
||||||
|
|
||||||
|
+------------------+---------+--------------------------------------------+
|
||||||
|
| Value | Codec | Meaning |
|
||||||
|
+==================+=========+============================================+
|
||||||
|
| ``'surrogates'`` | utf-8 | Allow encoding and decoding of surrogate |
|
||||||
|
| | | codes in UTF-8. |
|
||||||
|
+------------------+---------+--------------------------------------------+
|
||||||
|
|
||||||
|
.. versionadded:: 3.1
|
||||||
|
The ``'surrogates'`` error handler.
|
||||||
|
|
||||||
The set of allowed values can be extended via :meth:`register_error`.
|
The set of allowed values can be extended via :meth:`register_error`.
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -169,13 +169,13 @@ class BaseBytesTest(unittest.TestCase):
|
||||||
self.assertEqual(b[start:stop:step], self.type2test(L[start:stop:step]))
|
self.assertEqual(b[start:stop:step], self.type2test(L[start:stop:step]))
|
||||||
|
|
||||||
def test_encoding(self):
|
def test_encoding(self):
|
||||||
sample = "Hello world\n\u1234\u5678\u9abc\udef0"
|
sample = "Hello world\n\u1234\u5678\u9abc"
|
||||||
for enc in ("utf8", "utf16"):
|
for enc in ("utf8", "utf16"):
|
||||||
b = self.type2test(sample, enc)
|
b = self.type2test(sample, enc)
|
||||||
self.assertEqual(b, self.type2test(sample.encode(enc)))
|
self.assertEqual(b, self.type2test(sample.encode(enc)))
|
||||||
self.assertRaises(UnicodeEncodeError, self.type2test, sample, "latin1")
|
self.assertRaises(UnicodeEncodeError, self.type2test, sample, "latin1")
|
||||||
b = self.type2test(sample, "latin1", "ignore")
|
b = self.type2test(sample, "latin1", "ignore")
|
||||||
self.assertEqual(b, self.type2test(sample[:-4], "utf-8"))
|
self.assertEqual(b, self.type2test(sample[:-3], "utf-8"))
|
||||||
|
|
||||||
def test_decode(self):
|
def test_decode(self):
|
||||||
sample = "Hello world\n\u1234\u5678\u9abc\def0\def0"
|
sample = "Hello world\n\u1234\u5678\u9abc\def0\def0"
|
||||||
|
|
|
@ -541,6 +541,17 @@ class UTF8Test(ReadTest):
|
||||||
self.check_state_handling_decode(self.encoding,
|
self.check_state_handling_decode(self.encoding,
|
||||||
u, u.encode(self.encoding))
|
u, u.encode(self.encoding))
|
||||||
|
|
||||||
|
def test_lone_surrogates(self):
|
||||||
|
self.assertRaises(UnicodeEncodeError, "\ud800".encode, "utf-8")
|
||||||
|
self.assertRaises(UnicodeDecodeError, b"\xed\xa0\x80".decode, "utf-8")
|
||||||
|
|
||||||
|
def test_surrogates_handler(self):
|
||||||
|
self.assertEquals("abc\ud800def".encode("utf-8", "surrogates"),
|
||||||
|
b"abc\xed\xa0\x80def")
|
||||||
|
self.assertEquals(b"abc\xed\xa0\x80def".decode("utf-8", "surrogates"),
|
||||||
|
"abc\ud800def")
|
||||||
|
self.assertTrue(codecs.lookup_error("surrogates"))
|
||||||
|
|
||||||
class UTF7Test(ReadTest):
|
class UTF7Test(ReadTest):
|
||||||
encoding = "utf-7"
|
encoding = "utf-7"
|
||||||
|
|
||||||
|
@ -1023,12 +1034,12 @@ class NameprepTest(unittest.TestCase):
|
||||||
# Skipped
|
# Skipped
|
||||||
continue
|
continue
|
||||||
# The Unicode strings are given in UTF-8
|
# The Unicode strings are given in UTF-8
|
||||||
orig = str(orig, "utf-8")
|
orig = str(orig, "utf-8", "surrogates")
|
||||||
if prepped is None:
|
if prepped is None:
|
||||||
# Input contains prohibited characters
|
# Input contains prohibited characters
|
||||||
self.assertRaises(UnicodeError, nameprep, orig)
|
self.assertRaises(UnicodeError, nameprep, orig)
|
||||||
else:
|
else:
|
||||||
prepped = str(prepped, "utf-8")
|
prepped = str(prepped, "utf-8", "surrogates")
|
||||||
try:
|
try:
|
||||||
self.assertEquals(nameprep(orig), prepped)
|
self.assertEquals(nameprep(orig), prepped)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
|
|
|
@ -886,10 +886,10 @@ class UnicodeTest(
|
||||||
self.assertEqual('\u20ac'.encode('utf-8'), b'\xe2\x82\xac')
|
self.assertEqual('\u20ac'.encode('utf-8'), b'\xe2\x82\xac')
|
||||||
self.assertEqual('\ud800\udc02'.encode('utf-8'), b'\xf0\x90\x80\x82')
|
self.assertEqual('\ud800\udc02'.encode('utf-8'), b'\xf0\x90\x80\x82')
|
||||||
self.assertEqual('\ud84d\udc56'.encode('utf-8'), b'\xf0\xa3\x91\x96')
|
self.assertEqual('\ud84d\udc56'.encode('utf-8'), b'\xf0\xa3\x91\x96')
|
||||||
self.assertEqual('\ud800'.encode('utf-8'), b'\xed\xa0\x80')
|
self.assertEqual('\ud800'.encode('utf-8', 'surrogates'), b'\xed\xa0\x80')
|
||||||
self.assertEqual('\udc00'.encode('utf-8'), b'\xed\xb0\x80')
|
self.assertEqual('\udc00'.encode('utf-8', 'surrogates'), b'\xed\xb0\x80')
|
||||||
self.assertEqual(
|
self.assertEqual(
|
||||||
('\ud800\udc02'*1000).encode('utf-8'),
|
('\ud800\udc02'*1000).encode('utf-8', 'surrogates'),
|
||||||
b'\xf0\x90\x80\x82'*1000
|
b'\xf0\x90\x80\x82'*1000
|
||||||
)
|
)
|
||||||
self.assertEqual(
|
self.assertEqual(
|
||||||
|
|
|
@ -13,6 +13,7 @@ import subprocess
|
||||||
import test.support
|
import test.support
|
||||||
|
|
||||||
encoding = 'utf-8'
|
encoding = 'utf-8'
|
||||||
|
errors = 'surrogates'
|
||||||
|
|
||||||
|
|
||||||
### Run tests
|
### Run tests
|
||||||
|
@ -61,7 +62,7 @@ class UnicodeMethodsTest(unittest.TestCase):
|
||||||
(char + 'ABC').title(),
|
(char + 'ABC').title(),
|
||||||
|
|
||||||
]
|
]
|
||||||
h.update(''.join(data).encode(encoding))
|
h.update(''.join(data).encode(encoding, errors))
|
||||||
result = h.hexdigest()
|
result = h.hexdigest()
|
||||||
self.assertEqual(result, self.expectedchecksum)
|
self.assertEqual(result, self.expectedchecksum)
|
||||||
|
|
||||||
|
|
|
@ -12,6 +12,8 @@ What's New in Python 3.1 beta 1?
|
||||||
Core and Builtins
|
Core and Builtins
|
||||||
-----------------
|
-----------------
|
||||||
|
|
||||||
|
- Issue #3672: Reject surrogates in utf-8 codec; add surrogates error handler.
|
||||||
|
|
||||||
- Issue #5883: In the io module, the BufferedIOBase and TextIOBase ABCs have
|
- Issue #5883: In the io module, the BufferedIOBase and TextIOBase ABCs have
|
||||||
received a new method, detach(). detach() disconnects the underlying stream
|
received a new method, detach(). detach() disconnects the underlying stream
|
||||||
from the buffer or text IO and returns it.
|
from the buffer or text IO and returns it.
|
||||||
|
|
|
@ -154,6 +154,11 @@ const unsigned char _Py_ascii_whitespace[] = {
|
||||||
0, 0, 0, 0, 0, 0, 0, 0
|
0, 0, 0, 0, 0, 0, 0, 0
|
||||||
};
|
};
|
||||||
|
|
||||||
|
static PyObject *unicode_encode_call_errorhandler(const char *errors,
|
||||||
|
PyObject **errorHandler,const char *encoding, const char *reason,
|
||||||
|
const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
|
||||||
|
Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
|
||||||
|
|
||||||
/* Same for linebreaks */
|
/* Same for linebreaks */
|
||||||
static unsigned char ascii_linebreak[] = {
|
static unsigned char ascii_linebreak[] = {
|
||||||
0, 0, 0, 0, 0, 0, 0, 0,
|
0, 0, 0, 0, 0, 0, 0, 0,
|
||||||
|
@ -2214,14 +2219,7 @@ PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
|
||||||
goto utf8Error;
|
goto utf8Error;
|
||||||
}
|
}
|
||||||
ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
|
ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
|
||||||
if (ch < 0x0800) {
|
if (ch < 0x0800 || (ch >= 0xd800 && ch <= 0xDFFF)) {
|
||||||
/* Note: UTF-8 encodings of surrogates are considered
|
|
||||||
legal UTF-8 sequences;
|
|
||||||
|
|
||||||
XXX For wide builds (UCS-4) we should probably try
|
|
||||||
to recombine the surrogates into a single code
|
|
||||||
unit.
|
|
||||||
*/
|
|
||||||
errmsg = "illegal encoding";
|
errmsg = "illegal encoding";
|
||||||
startinpos = s-starts;
|
startinpos = s-starts;
|
||||||
endinpos = startinpos+3;
|
endinpos = startinpos+3;
|
||||||
|
@ -2328,6 +2326,8 @@ PyUnicode_EncodeUTF8(const Py_UNICODE *s,
|
||||||
Py_ssize_t nallocated; /* number of result bytes allocated */
|
Py_ssize_t nallocated; /* number of result bytes allocated */
|
||||||
Py_ssize_t nneeded; /* number of result bytes needed */
|
Py_ssize_t nneeded; /* number of result bytes needed */
|
||||||
char stackbuf[MAX_SHORT_UNICHARS * 4];
|
char stackbuf[MAX_SHORT_UNICHARS * 4];
|
||||||
|
PyObject *errorHandler = NULL;
|
||||||
|
PyObject *exc = NULL;
|
||||||
|
|
||||||
assert(s != NULL);
|
assert(s != NULL);
|
||||||
assert(size >= 0);
|
assert(size >= 0);
|
||||||
|
@ -2367,6 +2367,7 @@ PyUnicode_EncodeUTF8(const Py_UNICODE *s,
|
||||||
else {
|
else {
|
||||||
/* Encode UCS2 Unicode ordinals */
|
/* Encode UCS2 Unicode ordinals */
|
||||||
if (ch < 0x10000) {
|
if (ch < 0x10000) {
|
||||||
|
#ifndef Py_UNICODE_WIDE
|
||||||
/* Special case: check for high surrogate */
|
/* Special case: check for high surrogate */
|
||||||
if (0xD800 <= ch && ch <= 0xDBFF && i != size) {
|
if (0xD800 <= ch && ch <= 0xDBFF && i != size) {
|
||||||
Py_UCS4 ch2 = s[i];
|
Py_UCS4 ch2 = s[i];
|
||||||
|
@ -2379,6 +2380,36 @@ PyUnicode_EncodeUTF8(const Py_UNICODE *s,
|
||||||
}
|
}
|
||||||
/* Fall through: handles isolated high surrogates */
|
/* Fall through: handles isolated high surrogates */
|
||||||
}
|
}
|
||||||
|
#endif
|
||||||
|
if (ch >= 0xd800 && ch <= 0xdfff) {
|
||||||
|
Py_ssize_t newpos;
|
||||||
|
PyObject *rep;
|
||||||
|
char *prep;
|
||||||
|
int k;
|
||||||
|
rep = unicode_encode_call_errorhandler
|
||||||
|
(errors, &errorHandler, "utf-8", "surrogates not allowed",
|
||||||
|
s, size, &exc, i-1, i, &newpos);
|
||||||
|
if (!rep)
|
||||||
|
goto error;
|
||||||
|
/* Implementation limitations: only support error handlers that return
|
||||||
|
bytes, and only support up to four replacement bytes. */
|
||||||
|
if (!PyBytes_Check(rep)) {
|
||||||
|
PyErr_SetString(PyExc_TypeError, "error handler should have returned bytes");
|
||||||
|
Py_DECREF(rep);
|
||||||
|
goto error;
|
||||||
|
}
|
||||||
|
if (PyBytes_Size(rep) > 4) {
|
||||||
|
PyErr_SetString(PyExc_TypeError, "error handler returned too many bytes");
|
||||||
|
Py_DECREF(rep);
|
||||||
|
goto error;
|
||||||
|
}
|
||||||
|
prep = PyBytes_AsString(rep);
|
||||||
|
for(k = PyBytes_Size(rep); k > 0; k--)
|
||||||
|
*p++ = *prep++;
|
||||||
|
Py_DECREF(rep);
|
||||||
|
continue;
|
||||||
|
|
||||||
|
}
|
||||||
*p++ = (char)(0xe0 | (ch >> 12));
|
*p++ = (char)(0xe0 | (ch >> 12));
|
||||||
*p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
|
*p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
|
||||||
*p++ = (char)(0x80 | (ch & 0x3f));
|
*p++ = (char)(0x80 | (ch & 0x3f));
|
||||||
|
@ -2405,7 +2436,14 @@ PyUnicode_EncodeUTF8(const Py_UNICODE *s,
|
||||||
assert(nneeded <= nallocated);
|
assert(nneeded <= nallocated);
|
||||||
_PyBytes_Resize(&result, nneeded);
|
_PyBytes_Resize(&result, nneeded);
|
||||||
}
|
}
|
||||||
|
Py_XDECREF(errorHandler);
|
||||||
|
Py_XDECREF(exc);
|
||||||
return result;
|
return result;
|
||||||
|
error:
|
||||||
|
Py_XDECREF(errorHandler);
|
||||||
|
Py_XDECREF(exc);
|
||||||
|
Py_XDECREF(result);
|
||||||
|
return NULL;
|
||||||
|
|
||||||
#undef MAX_SHORT_UNICHARS
|
#undef MAX_SHORT_UNICHARS
|
||||||
}
|
}
|
||||||
|
@ -3897,7 +3935,7 @@ static PyObject *unicode_encode_call_errorhandler(const char *errors,
|
||||||
Py_ssize_t startpos, Py_ssize_t endpos,
|
Py_ssize_t startpos, Py_ssize_t endpos,
|
||||||
Py_ssize_t *newpos)
|
Py_ssize_t *newpos)
|
||||||
{
|
{
|
||||||
static char *argparse = "O!n;encoding error handler must return (str, int) tuple";
|
static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
|
||||||
|
|
||||||
PyObject *restuple;
|
PyObject *restuple;
|
||||||
PyObject *resunicode;
|
PyObject *resunicode;
|
||||||
|
@ -3918,15 +3956,20 @@ static PyObject *unicode_encode_call_errorhandler(const char *errors,
|
||||||
if (restuple == NULL)
|
if (restuple == NULL)
|
||||||
return NULL;
|
return NULL;
|
||||||
if (!PyTuple_Check(restuple)) {
|
if (!PyTuple_Check(restuple)) {
|
||||||
PyErr_SetString(PyExc_TypeError, &argparse[4]);
|
PyErr_SetString(PyExc_TypeError, &argparse[3]);
|
||||||
Py_DECREF(restuple);
|
Py_DECREF(restuple);
|
||||||
return NULL;
|
return NULL;
|
||||||
}
|
}
|
||||||
if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
|
if (!PyArg_ParseTuple(restuple, argparse,
|
||||||
&resunicode, newpos)) {
|
&resunicode, newpos)) {
|
||||||
Py_DECREF(restuple);
|
Py_DECREF(restuple);
|
||||||
return NULL;
|
return NULL;
|
||||||
}
|
}
|
||||||
|
if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
|
||||||
|
PyErr_SetString(PyExc_TypeError, &argparse[3]);
|
||||||
|
Py_DECREF(restuple);
|
||||||
|
return NULL;
|
||||||
|
}
|
||||||
if (*newpos<0)
|
if (*newpos<0)
|
||||||
*newpos = size+*newpos;
|
*newpos = size+*newpos;
|
||||||
if (*newpos<0 || *newpos>size) {
|
if (*newpos<0 || *newpos>size) {
|
||||||
|
@ -4064,6 +4107,12 @@ static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
|
||||||
collstart-startp, collend-startp, &newpos);
|
collstart-startp, collend-startp, &newpos);
|
||||||
if (repunicode == NULL)
|
if (repunicode == NULL)
|
||||||
goto onError;
|
goto onError;
|
||||||
|
if (!PyUnicode_Check(repunicode)) {
|
||||||
|
/* Implementation limitation: byte results not supported yet. */
|
||||||
|
PyErr_SetString(PyExc_TypeError, "error handler should return unicode");
|
||||||
|
Py_DECREF(repunicode);
|
||||||
|
goto onError;
|
||||||
|
}
|
||||||
/* need more space? (at least enough for what we
|
/* need more space? (at least enough for what we
|
||||||
have+the replacement+the rest of the string, so
|
have+the replacement+the rest of the string, so
|
||||||
we won't have to check space for encodable characters) */
|
we won't have to check space for encodable characters) */
|
||||||
|
@ -5027,6 +5076,12 @@ int charmap_encoding_error(
|
||||||
collstartpos, collendpos, &newpos);
|
collstartpos, collendpos, &newpos);
|
||||||
if (repunicode == NULL)
|
if (repunicode == NULL)
|
||||||
return -1;
|
return -1;
|
||||||
|
if (!PyUnicode_Check(repunicode)) {
|
||||||
|
/* Implementation limitation: byte results not supported yet. */
|
||||||
|
PyErr_SetString(PyExc_TypeError, "error handler should return unicode");
|
||||||
|
Py_DECREF(repunicode);
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
/* generate replacement */
|
/* generate replacement */
|
||||||
repsize = PyUnicode_GET_SIZE(repunicode);
|
repsize = PyUnicode_GET_SIZE(repunicode);
|
||||||
for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
|
for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
|
||||||
|
@ -5588,6 +5643,12 @@ int PyUnicode_EncodeDecimal(Py_UNICODE *s,
|
||||||
collstart-s, collend-s, &newpos);
|
collstart-s, collend-s, &newpos);
|
||||||
if (repunicode == NULL)
|
if (repunicode == NULL)
|
||||||
goto onError;
|
goto onError;
|
||||||
|
if (!PyUnicode_Check(repunicode)) {
|
||||||
|
/* Implementation limitation: byte results not supported yet. */
|
||||||
|
PyErr_SetString(PyExc_TypeError, "error handler should return unicode");
|
||||||
|
Py_DECREF(repunicode);
|
||||||
|
goto onError;
|
||||||
|
}
|
||||||
/* generate replacement */
|
/* generate replacement */
|
||||||
repsize = PyUnicode_GET_SIZE(repunicode);
|
repsize = PyUnicode_GET_SIZE(repunicode);
|
||||||
for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
|
for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
|
||||||
|
|
|
@ -748,6 +748,85 @@ PyObject *PyCodec_BackslashReplaceErrors(PyObject *exc)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* Error callback that lets surrogate code units pass through UTF-8.

   exc is the UnicodeEncodeError or UnicodeDecodeError raised by the codec.
   Returns a new (replacement, newpos) tuple, or NULL with an exception set.

   - UnicodeEncodeError: every code unit in [start, end) must be a surrogate
     (0xD800..0xDFFF).  Each one is written as its three-byte UTF-8 form; the
     replacement is a bytes object and newpos is end.  If any code unit in
     the range is not a surrogate, exc itself is re-raised.
   - UnicodeDecodeError: the three bytes at start must form a three-byte
     sequence that encodes a surrogate.  The replacement is that single code
     unit and newpos is start+3.  Otherwise exc itself is re-raised.
   - Any other exception type is reported through wrong_exception_type(). */
PyObject *PyCodec_SurrogateErrors(PyObject *exc)
{
    PyObject *restuple;
    PyObject *object;
    Py_ssize_t start;
    Py_ssize_t end;
    PyObject *res;
    if (PyObject_IsInstance(exc, PyExc_UnicodeEncodeError)) {
        Py_UNICODE *p;
        Py_UNICODE *startp;
        char *outp;
        if (PyUnicodeEncodeError_GetStart(exc, &start))
            return NULL;
        if (PyUnicodeEncodeError_GetEnd(exc, &end))
            return NULL;
        if (!(object = PyUnicodeEncodeError_GetObject(exc)))
            return NULL;
        startp = PyUnicode_AS_UNICODE(object);
        /* Each surrogate takes exactly three bytes in UTF-8. */
        res = PyBytes_FromStringAndSize(NULL, 3*(end-start));
        if (!res) {
            Py_DECREF(object);
            return NULL;
        }
        outp = PyBytes_AsString(res);
        for (p = startp+start; p < startp+end; p++) {
            Py_UNICODE ch = *p;
            if (ch < 0xd800 || ch > 0xdfff) {
                /* Not a surrogate, fail with original exception */
                PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
                Py_DECREF(res);
                Py_DECREF(object);
                return NULL;
            }
            *outp++ = (char)(0xe0 | (ch >> 12));
            *outp++ = (char)(0x80 | ((ch >> 6) & 0x3f));
            *outp++ = (char)(0x80 | (ch & 0x3f));
        }
        restuple = Py_BuildValue("(On)", res, end);
        Py_DECREF(res);
        Py_DECREF(object);
        return restuple;
    }
    else if (PyObject_IsInstance(exc, PyExc_UnicodeDecodeError)) {
        unsigned char *p;
        Py_UNICODE ch = 0;
        if (PyUnicodeDecodeError_GetStart(exc, &start))
            return NULL;
        if (!(object = PyUnicodeDecodeError_GetObject(exc)))
            return NULL;
        if (!(p = (unsigned char*)PyBytes_AsString(object))) {
            Py_DECREF(object);
            return NULL;
        }
        /* Try decoding a single surrogate character. If
           there are more, let the codec call us again. */
        p += start;
        /* All three conditions must hold (lead byte 1110xxxx followed by two
           10xxxxxx continuation bytes), and three bytes must actually be
           available so p[1] and p[2] are not read past the end of the input. */
        if (PyBytes_GET_SIZE(object) - start >= 3 &&
            (p[0] & 0xf0) == 0xe0 &&
            (p[1] & 0xc0) == 0x80 &&
            (p[2] & 0xc0) == 0x80) {
            /* it's a three-byte code */
            ch = ((p[0] & 0x0f) << 12) + ((p[1] & 0x3f) << 6) + (p[2] & 0x3f);
            if (ch < 0xd800 || ch > 0xdfff)
                /* it's not a surrogate - fail */
                ch = 0;
        }
        Py_DECREF(object);
        if (ch == 0) {
            /* ch stays 0 when nothing decodable as a surrogate was found. */
            PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
            return NULL;
        }
        return Py_BuildValue("(u#n)", &ch, 1, start+3);
    }
    else {
        wrong_exception_type(exc);
        return NULL;
    }
}
|
||||||
|
|
||||||
|
|
||||||
static PyObject *strict_errors(PyObject *self, PyObject *exc)
|
static PyObject *strict_errors(PyObject *self, PyObject *exc)
|
||||||
{
|
{
|
||||||
return PyCodec_StrictErrors(exc);
|
return PyCodec_StrictErrors(exc);
|
||||||
|
@ -777,6 +856,11 @@ static PyObject *backslashreplace_errors(PyObject *self, PyObject *exc)
|
||||||
return PyCodec_BackslashReplaceErrors(exc);
|
return PyCodec_BackslashReplaceErrors(exc);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* Method-table entry point for the "surrogates" error handler: forwards the
   exception object to PyCodec_SurrogateErrors() and hands back its result.
   self is unused. */
static PyObject *surrogates_errors(PyObject *self, PyObject *exc)
{
    PyObject *result = PyCodec_SurrogateErrors(exc);
    return result;
}
|
||||||
|
|
||||||
static int _PyCodecRegistry_Init(void)
|
static int _PyCodecRegistry_Init(void)
|
||||||
{
|
{
|
||||||
static struct {
|
static struct {
|
||||||
|
@ -823,6 +907,14 @@ static int _PyCodecRegistry_Init(void)
|
||||||
backslashreplace_errors,
|
backslashreplace_errors,
|
||||||
METH_O
|
METH_O
|
||||||
}
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"surrogates",
|
||||||
|
{
|
||||||
|
"surrogates",
|
||||||
|
surrogates_errors,
|
||||||
|
METH_O
|
||||||
|
}
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
|
@ -312,7 +312,9 @@ w_object(PyObject *v, WFILE *p)
|
||||||
}
|
}
|
||||||
else if (PyUnicode_CheckExact(v)) {
|
else if (PyUnicode_CheckExact(v)) {
|
||||||
PyObject *utf8;
|
PyObject *utf8;
|
||||||
utf8 = PyUnicode_AsUTF8String(v);
|
utf8 = PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(v),
|
||||||
|
PyUnicode_GET_SIZE(v),
|
||||||
|
"surrogates");
|
||||||
if (utf8 == NULL) {
|
if (utf8 == NULL) {
|
||||||
p->depth--;
|
p->depth--;
|
||||||
p->error = WFERR_UNMARSHALLABLE;
|
p->error = WFERR_UNMARSHALLABLE;
|
||||||
|
@ -810,7 +812,7 @@ r_object(RFILE *p)
|
||||||
retval = NULL;
|
retval = NULL;
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
v = PyUnicode_DecodeUTF8(buffer, n, NULL);
|
v = PyUnicode_DecodeUTF8(buffer, n, "surrogates");
|
||||||
PyMem_DEL(buffer);
|
PyMem_DEL(buffer);
|
||||||
retval = v;
|
retval = v;
|
||||||
break;
|
break;
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue