Issue #12892: The utf-16* and utf-32* codecs now reject (lone) surrogates.

The utf-16* and utf-32* encoders no longer allow surrogate code points
(U+D800-U+DFFF) to be encoded.
The utf-32* decoders no longer decode byte sequences that correspond to
surrogate code points.
The surrogatepass error handler now works with the utf-16* and utf-32* codecs.

Based on patches by Victor Stinner and Kang-Hao (Kenny) Lu.
This commit is contained in:
Serhiy Storchaka 2013-11-19 11:32:41 +02:00
parent a938bcfe95
commit 58cf607d13
8 changed files with 639 additions and 78 deletions

View file

@ -753,6 +753,65 @@ PyObject *PyCodec_BackslashReplaceErrors(PyObject *exc)
}
}
/* Codec identifiers returned by get_standard_encoding(). */
#define ENC_UTF8 0
#define ENC_UTF16BE 1
#define ENC_UTF16LE 2
#define ENC_UTF32BE 3
#define ENC_UTF32LE 4
/* Map an encoding name onto one of the ENC_* codec identifiers.

   Recognized spellings are "utf", an optional '-' or '_', then "16" or
   "32", optionally followed by an optional '-' or '_' and "be" or "le".
   Letters are matched case-insensitively.  A name without a byte-order
   suffix selects the byte order of the build (WORDS_BIGENDIAN).

   encoding    NUL-terminated encoding name; must not be NULL.
   bytelength  out parameter, always written: 2 for the UTF-16 variants,
               4 for the UTF-32 variants, 3 for ENC_UTF8.

   Every name that is not recognized as a UTF-16 or UTF-32 variant,
   including a name with an unknown byte-order suffix, is reported as
   ENC_UTF8 with a byte length of 3. */
static int
get_standard_encoding(const char *encoding, int *bytelength)
{
    if (Py_TOLOWER(encoding[0]) == 'u' &&
        Py_TOLOWER(encoding[1]) == 't' &&
        Py_TOLOWER(encoding[2]) == 'f') {
        int width = 0;              /* 0 means: neither "16" nor "32" */
        int be_code = ENC_UTF8;
        int le_code = ENC_UTF8;

        encoding += 3;
        if (*encoding == '-' || *encoding == '_')
            encoding++;

        if (encoding[0] == '1' && encoding[1] == '6') {
            width = 2;
            be_code = ENC_UTF16BE;
            le_code = ENC_UTF16LE;
        }
        else if (encoding[0] == '3' && encoding[1] == '2') {
            width = 4;
            be_code = ENC_UTF32BE;
            le_code = ENC_UTF32LE;
        }

        if (width != 0) {
            encoding += 2;
            if (*encoding == '\0') {
                /* No suffix: use the byte order of this build. */
                *bytelength = width;
#ifdef WORDS_BIGENDIAN
                return be_code;
#else
                return le_code;
#endif
            }
            if (*encoding == '-' || *encoding == '_')
                encoding++;
            /* Check encoding[0] before touching encoding[1]: for a name
               that ends right after the separator (e.g. "utf-16-"),
               encoding[0] is the terminating NUL and encoding[1] would
               lie past the end of the string. */
            if (encoding[0] != '\0' &&
                Py_TOLOWER(encoding[1]) == 'e' &&
                encoding[2] == '\0') {
                if (Py_TOLOWER(encoding[0]) == 'b') {
                    *bytelength = width;
                    return be_code;
                }
                if (Py_TOLOWER(encoding[0]) == 'l') {
                    *bytelength = width;
                    return le_code;
                }
            }
        }
    }
    /* utf-8 */
    *bytelength = 3;
    return ENC_UTF8;
}
/* This handler is declared static until someone demonstrates
a need to call it directly. */
static PyObject *
@ -760,24 +819,40 @@ PyCodec_SurrogatePassErrors(PyObject *exc)
{
PyObject *restuple;
PyObject *object;
PyObject *encode;
char *encoding;
int code;
int bytelength;
Py_ssize_t i;
Py_ssize_t start;
Py_ssize_t end;
PyObject *res;
if (PyObject_IsInstance(exc, PyExc_UnicodeEncodeError)) {
char *outp;
unsigned char *outp;
if (PyUnicodeEncodeError_GetStart(exc, &start))
return NULL;
if (PyUnicodeEncodeError_GetEnd(exc, &end))
return NULL;
if (!(object = PyUnicodeEncodeError_GetObject(exc)))
return NULL;
res = PyBytes_FromStringAndSize(NULL, 3*(end-start));
if (!(encode = PyUnicodeEncodeError_GetEncoding(exc))) {
Py_DECREF(object);
return NULL;
}
if (!(encoding = PyUnicode_AsUTF8(encode))) {
Py_DECREF(object);
Py_DECREF(encode);
return NULL;
}
code = get_standard_encoding(encoding, &bytelength);
Py_DECREF(encode);
res = PyBytes_FromStringAndSize(NULL, bytelength*(end-start));
if (!res) {
Py_DECREF(object);
return NULL;
}
outp = PyBytes_AsString(res);
outp = (unsigned char*)PyBytes_AsString(res);
for (i = start; i < end; i++) {
/* object is guaranteed to be "ready" */
Py_UCS4 ch = PyUnicode_READ_CHAR(object, i);
@ -788,9 +863,33 @@ PyCodec_SurrogatePassErrors(PyObject *exc)
Py_DECREF(object);
return NULL;
}
*outp++ = (char)(0xe0 | (ch >> 12));
*outp++ = (char)(0x80 | ((ch >> 6) & 0x3f));
*outp++ = (char)(0x80 | (ch & 0x3f));
switch (code) {
case ENC_UTF8:
*outp++ = (unsigned char)(0xe0 | (ch >> 12));
*outp++ = (unsigned char)(0x80 | ((ch >> 6) & 0x3f));
*outp++ = (unsigned char)(0x80 | (ch & 0x3f));
break;
case ENC_UTF16LE:
*outp++ = (unsigned char) ch;
*outp++ = (unsigned char)(ch >> 8);
break;
case ENC_UTF16BE:
*outp++ = (unsigned char)(ch >> 8);
*outp++ = (unsigned char) ch;
break;
case ENC_UTF32LE:
*outp++ = (unsigned char) ch;
*outp++ = (unsigned char)(ch >> 8);
*outp++ = (unsigned char)(ch >> 16);
*outp++ = (unsigned char)(ch >> 24);
break;
case ENC_UTF32BE:
*outp++ = (unsigned char)(ch >> 24);
*outp++ = (unsigned char)(ch >> 16);
*outp++ = (unsigned char)(ch >> 8);
*outp++ = (unsigned char) ch;
break;
}
}
restuple = Py_BuildValue("(On)", res, end);
Py_DECREF(res);
@ -802,34 +901,64 @@ PyCodec_SurrogatePassErrors(PyObject *exc)
Py_UCS4 ch = 0;
if (PyUnicodeDecodeError_GetStart(exc, &start))
return NULL;
if (PyUnicodeDecodeError_GetEnd(exc, &end))
return NULL;
if (!(object = PyUnicodeDecodeError_GetObject(exc)))
return NULL;
if (!(p = (unsigned char*)PyBytes_AsString(object))) {
Py_DECREF(object);
return NULL;
}
if (!(encode = PyUnicodeDecodeError_GetEncoding(exc))) {
Py_DECREF(object);
return NULL;
}
if (!(encoding = PyUnicode_AsUTF8(encode))) {
Py_DECREF(object);
Py_DECREF(encode);
return NULL;
}
code = get_standard_encoding(encoding, &bytelength);
Py_DECREF(encode);
/* Try decoding a single surrogate character. If
there are more, let the codec call us again. */
p += start;
if (PyBytes_GET_SIZE(object) - start >= 3 &&
(p[0] & 0xf0) == 0xe0 &&
(p[1] & 0xc0) == 0x80 &&
(p[2] & 0xc0) == 0x80) {
/* it's a three-byte code */
ch = ((p[0] & 0x0f) << 12) + ((p[1] & 0x3f) << 6) + (p[2] & 0x3f);
if (!Py_UNICODE_IS_SURROGATE(ch))
/* it's not a surrogate - fail */
ch = 0;
if (PyBytes_GET_SIZE(object) - start >= bytelength) {
switch (code) {
case ENC_UTF8:
if ((p[0] & 0xf0) == 0xe0 &&
(p[1] & 0xc0) == 0x80 &&
(p[2] & 0xc0) == 0x80) {
/* it's a three-byte code */
ch = ((p[0] & 0x0f) << 12) + ((p[1] & 0x3f) << 6) + (p[2] & 0x3f);
}
break;
case ENC_UTF16LE:
ch = p[1] << 8 | p[0];
break;
case ENC_UTF16BE:
ch = p[0] << 8 | p[1];
break;
case ENC_UTF32LE:
ch = (p[3] << 24) | (p[2] << 16) | (p[1] << 8) | p[0];
break;
case ENC_UTF32BE:
ch = (p[0] << 24) | (p[1] << 16) | (p[2] << 8) | p[3];
break;
}
}
Py_DECREF(object);
if (ch == 0) {
if (!Py_UNICODE_IS_SURROGATE(ch)) {
/* it's not a surrogate - fail */
PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
return NULL;
}
res = PyUnicode_FromOrdinal(ch);
if (res == NULL)
return NULL;
return Py_BuildValue("(Nn)", res, start+3);
return Py_BuildValue("(Nn)", res, start + bytelength);
}
else {
wrong_exception_type(exc);