Issue #12892: The utf-16* and utf-32* codecs now reject (lone) surrogates.

The utf-16* and utf-32* encoders no longer allow surrogate code points (U+D800-U+DFFF) to be encoded. The utf-32* decoders no longer decode byte sequences that correspond to surrogate code points. The surrogatepass error handler now works with the utf-16* and utf-32* codecs. Based on patches by Victor Stinner and Kang-Hao (Kenny) Lu.
2025-11-03 11:23:31 +00:00 · 2013-11-19 11:32:41 +02:00 · 2013-11-19 11:32:41 +02:00 · 58cf607d13
commit 58cf607d13
parent a938bcfe95
8 changed files with 639 additions and 78 deletions
--- a/Python/codecs.c
+++ b/Python/codecs.c
@ -753,6 +753,65 @@ PyObject *PyCodec_BackslashReplaceErrors(PyObject *exc)
    }
 }

+#define ENC_UTF8        0
+#define ENC_UTF16BE     1
+#define ENC_UTF16LE     2
+#define ENC_UTF32BE     3
+#define ENC_UTF32LE     4
+
+static int
+get_standard_encoding(const char *encoding, int *bytelength)
+{
+    if (Py_TOLOWER(encoding[0]) == 'u' &&
+        Py_TOLOWER(encoding[1]) == 't' &&
+        Py_TOLOWER(encoding[2]) == 'f') {
+        encoding += 3;
+        if (*encoding == '-' || *encoding == '_' )
+            encoding++;
+        if (encoding[0] == '1' && encoding[1] == '6') {
+            encoding += 2;
+            *bytelength = 2;
+            if (*encoding == '\0') {
+#ifdef WORDS_BIGENDIAN
+                return ENC_UTF16BE;
+#else
+                return ENC_UTF16LE;
+#endif
+            }
+            if (*encoding == '-' || *encoding == '_' )
+                encoding++;
+            if (Py_TOLOWER(encoding[1]) == 'e' && encoding[2] == '\0') {
+                if (Py_TOLOWER(encoding[0]) == 'b')
+                    return ENC_UTF16BE;
+                if (Py_TOLOWER(encoding[0]) == 'l')
+                    return ENC_UTF16LE;
+            }
+        }
+        else if (encoding[0] == '3' && encoding[1] == '2') {
+            encoding += 2;
+            *bytelength = 4;
+            if (*encoding == '\0') {
+#ifdef WORDS_BIGENDIAN
+                return ENC_UTF32BE;
+#else
+                return ENC_UTF32LE;
+#endif
+            }
+            if (*encoding == '-' || *encoding == '_' )
+                encoding++;
+            if (Py_TOLOWER(encoding[1]) == 'e' && encoding[2] == '\0') {
+                if (Py_TOLOWER(encoding[0]) == 'b')
+                    return ENC_UTF32BE;
+                if (Py_TOLOWER(encoding[0]) == 'l')
+                    return ENC_UTF32LE;
+            }
+        }
+    }
+    /* utf-8 */
+    *bytelength = 3;
+    return ENC_UTF8;
+}
+
 /* This handler is declared static until someone demonstrates
   a need to call it directly. */
 static PyObject *
@ -760,24 +819,40 @@ PyCodec_SurrogatePassErrors(PyObject *exc)
 {
    PyObject *restuple;
    PyObject *object;
+    PyObject *encode;
+    char *encoding;
+    int code;
+    int bytelength;
    Py_ssize_t i;
    Py_ssize_t start;
    Py_ssize_t end;
    PyObject *res;
    if (PyObject_IsInstance(exc, PyExc_UnicodeEncodeError)) {
-        char *outp;
+        unsigned char *outp;
        if (PyUnicodeEncodeError_GetStart(exc, &start))
            return NULL;
        if (PyUnicodeEncodeError_GetEnd(exc, &end))
            return NULL;
        if (!(object = PyUnicodeEncodeError_GetObject(exc)))
            return NULL;
-        res = PyBytes_FromStringAndSize(NULL, 3*(end-start));
+        if (!(encode = PyUnicodeEncodeError_GetEncoding(exc))) {
+            Py_DECREF(object);
+            return NULL;
+        }
+        if (!(encoding = PyUnicode_AsUTF8(encode))) {
+            Py_DECREF(object);
+            Py_DECREF(encode);
+            return NULL;
+        }
+        code = get_standard_encoding(encoding, &bytelength);
+        Py_DECREF(encode);
+
+        res = PyBytes_FromStringAndSize(NULL, bytelength*(end-start));
        if (!res) {
            Py_DECREF(object);
            return NULL;
        }
-        outp = PyBytes_AsString(res);
+        outp = (unsigned char*)PyBytes_AsString(res);
        for (i = start; i < end; i++) {
            /* object is guaranteed to be "ready" */
            Py_UCS4 ch = PyUnicode_READ_CHAR(object, i);
@ -788,9 +863,33 @@ PyCodec_SurrogatePassErrors(PyObject *exc)
                Py_DECREF(object);
                return NULL;
            }
-            *outp++ = (char)(0xe0 | (ch >> 12));
-            *outp++ = (char)(0x80 | ((ch >> 6) & 0x3f));
-            *outp++ = (char)(0x80 | (ch & 0x3f));
+            switch (code) {
+            case ENC_UTF8:
+                *outp++ = (unsigned char)(0xe0 | (ch >> 12));
+                *outp++ = (unsigned char)(0x80 | ((ch >> 6) & 0x3f));
+                *outp++ = (unsigned char)(0x80 | (ch & 0x3f));
+                break;
+            case ENC_UTF16LE:
+                *outp++ = (unsigned char) ch;
+                *outp++ = (unsigned char)(ch >> 8);
+                break;
+            case ENC_UTF16BE:
+                *outp++ = (unsigned char)(ch >> 8);
+                *outp++ = (unsigned char) ch;
+                break;
+            case ENC_UTF32LE:
+                *outp++ = (unsigned char) ch;
+                *outp++ = (unsigned char)(ch >> 8);
+                *outp++ = (unsigned char)(ch >> 16);
+                *outp++ = (unsigned char)(ch >> 24);
+                break;
+            case ENC_UTF32BE:
+                *outp++ = (unsigned char)(ch >> 24);
+                *outp++ = (unsigned char)(ch >> 16);
+                *outp++ = (unsigned char)(ch >> 8);
+                *outp++ = (unsigned char) ch;
+                break;
+            }
        }
        restuple = Py_BuildValue("(On)", res, end);
        Py_DECREF(res);
@ -802,34 +901,64 @@ PyCodec_SurrogatePassErrors(PyObject *exc)
        Py_UCS4 ch = 0;
        if (PyUnicodeDecodeError_GetStart(exc, &start))
            return NULL;
+        if (PyUnicodeDecodeError_GetEnd(exc, &end))
+            return NULL;
        if (!(object = PyUnicodeDecodeError_GetObject(exc)))
            return NULL;
        if (!(p = (unsigned char*)PyBytes_AsString(object))) {
            Py_DECREF(object);
            return NULL;
        }
+        if (!(encode = PyUnicodeDecodeError_GetEncoding(exc))) {
+            Py_DECREF(object);
+            return NULL;
+        }
+        if (!(encoding = PyUnicode_AsUTF8(encode))) {
+            Py_DECREF(object);
+            Py_DECREF(encode);
+            return NULL;
+        }
+        code = get_standard_encoding(encoding, &bytelength);
+        Py_DECREF(encode);
+
        /* Try decoding a single surrogate character. If
           there are more, let the codec call us again. */
        p += start;
-        if (PyBytes_GET_SIZE(object) - start >= 3 &&
-            (p[0] & 0xf0) == 0xe0 &&
-            (p[1] & 0xc0) == 0x80 &&
-            (p[2] & 0xc0) == 0x80) {
-            /* it's a three-byte code */
-            ch = ((p[0] & 0x0f) << 12) + ((p[1] & 0x3f) << 6) + (p[2] & 0x3f);
-            if (!Py_UNICODE_IS_SURROGATE(ch))
-                /* it's not a surrogate - fail */
-                ch = 0;
+        if (PyBytes_GET_SIZE(object) - start >= bytelength) {
+            switch (code) {
+            case ENC_UTF8:
+                if ((p[0] & 0xf0) == 0xe0 &&
+                    (p[1] & 0xc0) == 0x80 &&
+                    (p[2] & 0xc0) == 0x80) {
+                    /* it's a three-byte code */
+                    ch = ((p[0] & 0x0f) << 12) + ((p[1] & 0x3f) << 6) + (p[2] & 0x3f);
+                }
+                break;
+            case ENC_UTF16LE:
+                ch = p[1] << 8 | p[0];
+                break;
+            case ENC_UTF16BE:
+                ch = p[0] << 8 | p[1];
+                break;
+            case ENC_UTF32LE:
+                ch = (p[3] << 24) | (p[2] << 16) | (p[1] << 8) | p[0];
+                break;
+            case ENC_UTF32BE:
+                ch = (p[0] << 24) | (p[1] << 16) | (p[2] << 8) | p[3];
+                break;
+            }
        }
+
        Py_DECREF(object);
-        if (ch == 0) {
+        if (!Py_UNICODE_IS_SURROGATE(ch)) {
+            /* it's not a surrogate - fail */
            PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
            return NULL;
        }
        res = PyUnicode_FromOrdinal(ch);
        if (res == NULL)
            return NULL;
-        return Py_BuildValue("(Nn)", res, start+3);
+        return Py_BuildValue("(Nn)", res, start + bytelength);
    }
    else {
        wrong_exception_type(exc);