mirror of
https://github.com/python/cpython.git
synced 2025-08-03 16:39:00 +00:00
Issue #12892: The utf-16* and utf-32* codecs now reject (lone) surrogates.
The utf-16* and utf-32* encoders no longer allow surrogate code points (U+D800-U+DFFF) to be encoded. The utf-32* decoders no longer decode byte sequences that correspond to surrogate code points. The surrogatepass error handler now works with the utf-16* and utf-32* codecs. Based on patches by Victor Stinner and Kang-Hao (Kenny) Lu.
This commit is contained in:
parent
a938bcfe95
commit
58cf607d13
8 changed files with 639 additions and 78 deletions
163
Python/codecs.c
163
Python/codecs.c
|
@ -753,6 +753,65 @@ PyObject *PyCodec_BackslashReplaceErrors(PyObject *exc)
|
|||
}
|
||||
}
|
||||
|
||||
/* Codes returned by get_standard_encoding() to say which UTF encoding,
   and for UTF-16/UTF-32 which byte order, a codec name refers to.
   ENC_UTF8 is also what get_standard_encoding() returns for any name it
   does not recognize. */
#define ENC_UTF8 0
|
||||
#define ENC_UTF16BE 1
|
||||
#define ENC_UTF16LE 2
|
||||
#define ENC_UTF32BE 3
|
||||
#define ENC_UTF32LE 4
|
||||
|
||||
static int
|
||||
get_standard_encoding(const char *encoding, int *bytelength)
|
||||
{
|
||||
if (Py_TOLOWER(encoding[0]) == 'u' &&
|
||||
Py_TOLOWER(encoding[1]) == 't' &&
|
||||
Py_TOLOWER(encoding[2]) == 'f') {
|
||||
encoding += 3;
|
||||
if (*encoding == '-' || *encoding == '_' )
|
||||
encoding++;
|
||||
if (encoding[0] == '1' && encoding[1] == '6') {
|
||||
encoding += 2;
|
||||
*bytelength = 2;
|
||||
if (*encoding == '\0') {
|
||||
#ifdef WORDS_BIGENDIAN
|
||||
return ENC_UTF16BE;
|
||||
#else
|
||||
return ENC_UTF16LE;
|
||||
#endif
|
||||
}
|
||||
if (*encoding == '-' || *encoding == '_' )
|
||||
encoding++;
|
||||
if (Py_TOLOWER(encoding[1]) == 'e' && encoding[2] == '\0') {
|
||||
if (Py_TOLOWER(encoding[0]) == 'b')
|
||||
return ENC_UTF16BE;
|
||||
if (Py_TOLOWER(encoding[0]) == 'l')
|
||||
return ENC_UTF16LE;
|
||||
}
|
||||
}
|
||||
else if (encoding[0] == '3' && encoding[1] == '2') {
|
||||
encoding += 2;
|
||||
*bytelength = 4;
|
||||
if (*encoding == '\0') {
|
||||
#ifdef WORDS_BIGENDIAN
|
||||
return ENC_UTF32BE;
|
||||
#else
|
||||
return ENC_UTF32LE;
|
||||
#endif
|
||||
}
|
||||
if (*encoding == '-' || *encoding == '_' )
|
||||
encoding++;
|
||||
if (Py_TOLOWER(encoding[1]) == 'e' && encoding[2] == '\0') {
|
||||
if (Py_TOLOWER(encoding[0]) == 'b')
|
||||
return ENC_UTF32BE;
|
||||
if (Py_TOLOWER(encoding[0]) == 'l')
|
||||
return ENC_UTF32LE;
|
||||
}
|
||||
}
|
||||
}
|
||||
/* utf-8 */
|
||||
*bytelength = 3;
|
||||
return ENC_UTF8;
|
||||
}
|
||||
|
||||
/* This handler is declared static until someone demonstrates
|
||||
a need to call it directly. */
|
||||
static PyObject *
|
||||
|
@ -760,24 +819,40 @@ PyCodec_SurrogatePassErrors(PyObject *exc)
|
|||
{
|
||||
PyObject *restuple;
|
||||
PyObject *object;
|
||||
PyObject *encode;
|
||||
char *encoding;
|
||||
int code;
|
||||
int bytelength;
|
||||
Py_ssize_t i;
|
||||
Py_ssize_t start;
|
||||
Py_ssize_t end;
|
||||
PyObject *res;
|
||||
if (PyObject_IsInstance(exc, PyExc_UnicodeEncodeError)) {
|
||||
char *outp;
|
||||
unsigned char *outp;
|
||||
if (PyUnicodeEncodeError_GetStart(exc, &start))
|
||||
return NULL;
|
||||
if (PyUnicodeEncodeError_GetEnd(exc, &end))
|
||||
return NULL;
|
||||
if (!(object = PyUnicodeEncodeError_GetObject(exc)))
|
||||
return NULL;
|
||||
res = PyBytes_FromStringAndSize(NULL, 3*(end-start));
|
||||
if (!(encode = PyUnicodeEncodeError_GetEncoding(exc))) {
|
||||
Py_DECREF(object);
|
||||
return NULL;
|
||||
}
|
||||
if (!(encoding = PyUnicode_AsUTF8(encode))) {
|
||||
Py_DECREF(object);
|
||||
Py_DECREF(encode);
|
||||
return NULL;
|
||||
}
|
||||
code = get_standard_encoding(encoding, &bytelength);
|
||||
Py_DECREF(encode);
|
||||
|
||||
res = PyBytes_FromStringAndSize(NULL, bytelength*(end-start));
|
||||
if (!res) {
|
||||
Py_DECREF(object);
|
||||
return NULL;
|
||||
}
|
||||
outp = PyBytes_AsString(res);
|
||||
outp = (unsigned char*)PyBytes_AsString(res);
|
||||
for (i = start; i < end; i++) {
|
||||
/* object is guaranteed to be "ready" */
|
||||
Py_UCS4 ch = PyUnicode_READ_CHAR(object, i);
|
||||
|
@ -788,9 +863,33 @@ PyCodec_SurrogatePassErrors(PyObject *exc)
|
|||
Py_DECREF(object);
|
||||
return NULL;
|
||||
}
|
||||
*outp++ = (char)(0xe0 | (ch >> 12));
|
||||
*outp++ = (char)(0x80 | ((ch >> 6) & 0x3f));
|
||||
*outp++ = (char)(0x80 | (ch & 0x3f));
|
||||
switch (code) {
|
||||
case ENC_UTF8:
|
||||
*outp++ = (unsigned char)(0xe0 | (ch >> 12));
|
||||
*outp++ = (unsigned char)(0x80 | ((ch >> 6) & 0x3f));
|
||||
*outp++ = (unsigned char)(0x80 | (ch & 0x3f));
|
||||
break;
|
||||
case ENC_UTF16LE:
|
||||
*outp++ = (unsigned char) ch;
|
||||
*outp++ = (unsigned char)(ch >> 8);
|
||||
break;
|
||||
case ENC_UTF16BE:
|
||||
*outp++ = (unsigned char)(ch >> 8);
|
||||
*outp++ = (unsigned char) ch;
|
||||
break;
|
||||
case ENC_UTF32LE:
|
||||
*outp++ = (unsigned char) ch;
|
||||
*outp++ = (unsigned char)(ch >> 8);
|
||||
*outp++ = (unsigned char)(ch >> 16);
|
||||
*outp++ = (unsigned char)(ch >> 24);
|
||||
break;
|
||||
case ENC_UTF32BE:
|
||||
*outp++ = (unsigned char)(ch >> 24);
|
||||
*outp++ = (unsigned char)(ch >> 16);
|
||||
*outp++ = (unsigned char)(ch >> 8);
|
||||
*outp++ = (unsigned char) ch;
|
||||
break;
|
||||
}
|
||||
}
|
||||
restuple = Py_BuildValue("(On)", res, end);
|
||||
Py_DECREF(res);
|
||||
|
@ -802,34 +901,64 @@ PyCodec_SurrogatePassErrors(PyObject *exc)
|
|||
Py_UCS4 ch = 0;
|
||||
if (PyUnicodeDecodeError_GetStart(exc, &start))
|
||||
return NULL;
|
||||
if (PyUnicodeDecodeError_GetEnd(exc, &end))
|
||||
return NULL;
|
||||
if (!(object = PyUnicodeDecodeError_GetObject(exc)))
|
||||
return NULL;
|
||||
if (!(p = (unsigned char*)PyBytes_AsString(object))) {
|
||||
Py_DECREF(object);
|
||||
return NULL;
|
||||
}
|
||||
if (!(encode = PyUnicodeDecodeError_GetEncoding(exc))) {
|
||||
Py_DECREF(object);
|
||||
return NULL;
|
||||
}
|
||||
if (!(encoding = PyUnicode_AsUTF8(encode))) {
|
||||
Py_DECREF(object);
|
||||
Py_DECREF(encode);
|
||||
return NULL;
|
||||
}
|
||||
code = get_standard_encoding(encoding, &bytelength);
|
||||
Py_DECREF(encode);
|
||||
|
||||
/* Try decoding a single surrogate character. If
|
||||
there are more, let the codec call us again. */
|
||||
p += start;
|
||||
if (PyBytes_GET_SIZE(object) - start >= 3 &&
|
||||
(p[0] & 0xf0) == 0xe0 &&
|
||||
(p[1] & 0xc0) == 0x80 &&
|
||||
(p[2] & 0xc0) == 0x80) {
|
||||
/* it's a three-byte code */
|
||||
ch = ((p[0] & 0x0f) << 12) + ((p[1] & 0x3f) << 6) + (p[2] & 0x3f);
|
||||
if (!Py_UNICODE_IS_SURROGATE(ch))
|
||||
/* it's not a surrogate - fail */
|
||||
ch = 0;
|
||||
if (PyBytes_GET_SIZE(object) - start >= bytelength) {
|
||||
switch (code) {
|
||||
case ENC_UTF8:
|
||||
if ((p[0] & 0xf0) == 0xe0 &&
|
||||
(p[1] & 0xc0) == 0x80 &&
|
||||
(p[2] & 0xc0) == 0x80) {
|
||||
/* it's a three-byte code */
|
||||
ch = ((p[0] & 0x0f) << 12) + ((p[1] & 0x3f) << 6) + (p[2] & 0x3f);
|
||||
}
|
||||
break;
|
||||
case ENC_UTF16LE:
|
||||
ch = p[1] << 8 | p[0];
|
||||
break;
|
||||
case ENC_UTF16BE:
|
||||
ch = p[0] << 8 | p[1];
|
||||
break;
|
||||
case ENC_UTF32LE:
|
||||
ch = (p[3] << 24) | (p[2] << 16) | (p[1] << 8) | p[0];
|
||||
break;
|
||||
case ENC_UTF32BE:
|
||||
ch = (p[0] << 24) | (p[1] << 16) | (p[2] << 8) | p[3];
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
Py_DECREF(object);
|
||||
if (ch == 0) {
|
||||
if (!Py_UNICODE_IS_SURROGATE(ch)) {
|
||||
/* it's not a surrogate - fail */
|
||||
PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
|
||||
return NULL;
|
||||
}
|
||||
res = PyUnicode_FromOrdinal(ch);
|
||||
if (res == NULL)
|
||||
return NULL;
|
||||
return Py_BuildValue("(Nn)", res, start+3);
|
||||
return Py_BuildValue("(Nn)", res, start + bytelength);
|
||||
}
|
||||
else {
|
||||
wrong_exception_type(exc);
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue