Issue #3672: Reject surrogates in utf-8 codec; add surrogates error handler.
2025-11-17 01:25:57 +00:00 · 2009-05-02 18:52:14 +00:00 · 2009-05-02 18:52:14 +00:00 · db12d454e6
commit db12d454e6
parent 02953d244f
9 changed files with 202 additions and 21 deletions
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@ -154,6 +154,11 @@ const unsigned char _Py_ascii_whitespace[] = {
    0, 0, 0, 0, 0, 0, 0, 0
 };

+static PyObject *unicode_encode_call_errorhandler(const char *errors,
+       PyObject **errorHandler,const char *encoding, const char *reason,
+       const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
+       Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
+
 /* Same for linebreaks */
 static unsigned char ascii_linebreak[] = {
    0, 0, 0, 0, 0, 0, 0, 0,
@ -2214,14 +2219,7 @@ PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
                goto utf8Error;
            }
            ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
-            if (ch < 0x0800) {
-                /* Note: UTF-8 encodings of surrogates are considered
-                   legal UTF-8 sequences;
-
-                   XXX For wide builds (UCS-4) we should probably try
-                   to recombine the surrogates into a single code
-                   unit.
-                */
+            if (ch < 0x0800 || (ch >= 0xd800 && ch <= 0xDFFF)) {
                errmsg = "illegal encoding";
                startinpos = s-starts;
                endinpos = startinpos+3;
@ -2328,6 +2326,8 @@ PyUnicode_EncodeUTF8(const Py_UNICODE *s,
    Py_ssize_t nallocated;      /* number of result bytes allocated */
    Py_ssize_t nneeded;            /* number of result bytes needed */
    char stackbuf[MAX_SHORT_UNICHARS * 4];
+    PyObject *errorHandler = NULL;
+    PyObject *exc = NULL;

    assert(s != NULL);
    assert(size >= 0);
@ -2367,6 +2367,7 @@ PyUnicode_EncodeUTF8(const Py_UNICODE *s,
        else {
            /* Encode UCS2 Unicode ordinals */
            if (ch < 0x10000) {
+#ifndef Py_UNICODE_WIDE
                /* Special case: check for high surrogate */
                if (0xD800 <= ch && ch <= 0xDBFF && i != size) {
                    Py_UCS4 ch2 = s[i];
@ -2379,6 +2380,36 @@ PyUnicode_EncodeUTF8(const Py_UNICODE *s,
                    }
                    /* Fall through: handles isolated high surrogates */
                }
+#endif
+                if (ch >= 0xd800 && ch <= 0xdfff) {
+                    Py_ssize_t newpos;
+                    PyObject *rep;
+                    char *prep;
+                    int k;
+                    rep = unicode_encode_call_errorhandler
+                        (errors, &errorHandler, "utf-8", "surrogates not allowed", 
+                         s, size, &exc, i-1, i, &newpos);
+                    if (!rep)
+                        goto error;
+                    /* Implementation limitations: only error handlers that return bytes
+                       are supported, and at most four replacement bytes are accepted. */
+                    if (!PyBytes_Check(rep)) {
+                        PyErr_SetString(PyExc_TypeError, "error handler should have returned bytes");
+                        Py_DECREF(rep);
+                        goto error;
+                    }
+                    if (PyBytes_Size(rep) > 4) {
+                        PyErr_SetString(PyExc_TypeError, "error handler returned too many bytes");
+                        Py_DECREF(rep);
+                        goto error;
+                    }
+                    prep = PyBytes_AsString(rep);
+                    for(k = PyBytes_Size(rep); k > 0; k--)
+                        *p++ = *prep++;
+                    Py_DECREF(rep);
+                    continue;
+                    
+                }
                *p++ = (char)(0xe0 | (ch >> 12));
                *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
                *p++ = (char)(0x80 | (ch & 0x3f));
@ -2405,7 +2436,14 @@ PyUnicode_EncodeUTF8(const Py_UNICODE *s,
        assert(nneeded <= nallocated);
        _PyBytes_Resize(&result, nneeded);
    }
+    Py_XDECREF(errorHandler);
+    Py_XDECREF(exc);
    return result;
+ error:
+    Py_XDECREF(errorHandler);
+    Py_XDECREF(exc);
+    Py_XDECREF(result);
+    return NULL;

 #undef MAX_SHORT_UNICHARS
 }
@ -3897,7 +3935,7 @@ static PyObject *unicode_encode_call_errorhandler(const char *errors,
                                                  Py_ssize_t startpos, Py_ssize_t endpos,
                                                  Py_ssize_t *newpos)
 {
-    static char *argparse = "O!n;encoding error handler must return (str, int) tuple";
+    static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";

    PyObject *restuple;
    PyObject *resunicode;
@ -3918,15 +3956,20 @@ static PyObject *unicode_encode_call_errorhandler(const char *errors,
    if (restuple == NULL)
        return NULL;
    if (!PyTuple_Check(restuple)) {
-        PyErr_SetString(PyExc_TypeError, &argparse[4]);
+        PyErr_SetString(PyExc_TypeError, &argparse[3]);
        Py_DECREF(restuple);
        return NULL;
    }
-    if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
+    if (!PyArg_ParseTuple(restuple, argparse,
                          &resunicode, newpos)) {
        Py_DECREF(restuple);
        return NULL;
    }
+    if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
+        PyErr_SetString(PyExc_TypeError, &argparse[3]);
+        Py_DECREF(restuple);
+        return NULL;
+    }
    if (*newpos<0)
        *newpos = size+*newpos;
    if (*newpos<0 || *newpos>size) {
@ -4064,6 +4107,12 @@ static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
                                                              collstart-startp, collend-startp, &newpos);
                if (repunicode == NULL)
                    goto onError;
+                if (!PyUnicode_Check(repunicode)) {
+                    /* Implementation limitation: byte results not supported yet. */
+                    PyErr_SetString(PyExc_TypeError, "error handler should return unicode");
+                    Py_DECREF(repunicode);
+                    goto onError;
+                }
                /* need more space? (at least enough for what we
                   have+the replacement+the rest of the string, so
                   we won't have to check space for encodable characters) */
@ -5027,6 +5076,12 @@ int charmap_encoding_error(
                                                      collstartpos, collendpos, &newpos);
        if (repunicode == NULL)
            return -1;
+        if (!PyUnicode_Check(repunicode)) {
+            /* Implementation limitation: byte results not supported yet. */
+            PyErr_SetString(PyExc_TypeError, "error handler should return unicode");
+            Py_DECREF(repunicode);
+            return -1;
+        }
        /* generate replacement  */
        repsize = PyUnicode_GET_SIZE(repunicode);
        for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
@ -5588,6 +5643,12 @@ int PyUnicode_EncodeDecimal(Py_UNICODE *s,
                                                          collstart-s, collend-s, &newpos);
            if (repunicode == NULL)
                goto onError;
+            if (!PyUnicode_Check(repunicode)) {
+                /* Implementation limitation: byte results not supported yet. */
+                PyErr_SetString(PyExc_TypeError, "error handler should return unicode");
+                Py_DECREF(repunicode);
+                goto onError;
+            }
            /* generate replacement  */
            repsize = PyUnicode_GET_SIZE(repunicode);
            for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {