Optimize backslashreplace error handler

Issue #25318: Optimize backslashreplace and xmlcharrefreplace error handlers in UTF-8 encoder. Optimize also backslashreplace error handler for ASCII and Latin1 encoders. Use the new _PyBytesWriter API to optimize these error handlers for the encoders. It avoids to create an exception and call the slow implementation of the error handler.
2025-10-17 20:28:43 +00:00 · 2015-10-09 01:39:28 +02:00 · 2015-10-09 01:39:28 +02:00 · e7bf86cd7d
commit e7bf86cd7d
parent fdfbf78114
2 changed files with 160 additions and 51 deletions
--- a/Objects/stringlib/codecs.h
+++ b/Objects/stringlib/codecs.h
@ -334,7 +334,6 @@ STRINGLIB(utf8_encoder)(PyObject *unicode,
                i += (endpos - startpos - 1);
                break;
            case _Py_ERROR_SURROGATEPASS:
                for (k=startpos; k<endpos; k++) {
                    ch = data[k];
@ -345,6 +344,22 @@ STRINGLIB(utf8_encoder)(PyObject *unicode,
                i += (endpos - startpos - 1);
                break;
            case _Py_ERROR_BACKSLASHREPLACE:
                p = backslashreplace(&writer, max_char_size, p,
                                     unicode, startpos, endpos);
                if (p == NULL)
                    goto error;
                i += (endpos - startpos - 1);
                break;
            case _Py_ERROR_XMLCHARREFREPLACE:
                p = xmlcharrefreplace(&writer, max_char_size, p,
                                      unicode, startpos, endpos);
                if (p == NULL)
                    goto error;
                i += (endpos - startpos - 1);
                break;
            case _Py_ERROR_SURROGATEESCAPE:
                for (k=startpos; k<endpos; k++) {
                    ch = data[k];
@ -359,7 +374,6 @@ STRINGLIB(utf8_encoder)(PyObject *unicode,
                startpos = k;
                assert(startpos < endpos);
                /* fall through the default handler */
            default:
                rep = unicode_encode_call_errorhandler(
                      errors, &error_handler_obj, "utf-8", "surrogates not allowed",
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@ -305,9 +305,10 @@ typedef enum {
    _Py_ERROR_UNKNOWN=0,
    _Py_ERROR_STRICT,
    _Py_ERROR_SURROGATEESCAPE,
    _Py_ERROR_SURROGATEPASS,
    _Py_ERROR_REPLACE,
    _Py_ERROR_IGNORE,
    _Py_ERROR_BACKSLASHREPLACE,
    _Py_ERROR_SURROGATEPASS,
    _Py_ERROR_XMLCHARREFREPLACE,
    _Py_ERROR_OTHER
 } _Py_error_handler;
@ -315,18 +316,18 @@ typedef enum {
 static _Py_error_handler
 get_error_handler(const char *errors)
 {
-    if (errors == NULL)
+    if (errors == NULL || strcmp(errors, "strict") == 0)
        return _Py_ERROR_STRICT;
    if (strcmp(errors, "strict") == 0)
        return _Py_ERROR_STRICT;
    if (strcmp(errors, "surrogateescape") == 0)
        return _Py_ERROR_SURROGATEESCAPE;
    if (strcmp(errors, "surrogatepass") == 0)
        return _Py_ERROR_SURROGATEPASS;
    if (strcmp(errors, "ignore") == 0)
        return _Py_ERROR_IGNORE;
    if (strcmp(errors, "replace") == 0)
        return _Py_ERROR_REPLACE;
    if (strcmp(errors, "ignore") == 0)
        return _Py_ERROR_IGNORE;
    if (strcmp(errors, "backslashreplace") == 0)
        return _Py_ERROR_BACKSLASHREPLACE;
    if (strcmp(errors, "surrogatepass") == 0)
        return _Py_ERROR_SURROGATEPASS;
    if (strcmp(errors, "xmlcharrefreplace") == 0)
        return _Py_ERROR_XMLCHARREFREPLACE;
    return _Py_ERROR_OTHER;
@ -771,6 +772,126 @@ unicode_result_unchanged(PyObject *unicode)
        return _PyUnicode_Copy(unicode);
 }
 /* Implementation of the "backslashreplace" error handler for 8-bit encodings:
   ASCII, Latin1, UTF-8, etc. */
 static char*
 backslashreplace(_PyBytesWriter *writer, Py_ssize_t prealloc_per_char,
                  char *str,
                 PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
 {
    Py_ssize_t size, i, prealloc;
    Py_UCS4 ch;
    enum PyUnicode_Kind kind;
    void *data;
    assert(PyUnicode_IS_READY(unicode));
    kind = PyUnicode_KIND(unicode);
    data = PyUnicode_DATA(unicode);
    size = 0;
    /* determine replacement size */
    for (i = collstart; i < collend; ++i) {
        Py_ssize_t incr;
        ch = PyUnicode_READ(kind, data, i);
        if (ch < 0x100)
            incr = 2+2;
        else if (ch < 0x10000)
            incr = 2+4;
        else {
            assert(ch <= MAX_UNICODE);
            incr = 2+6;
        }
        if (size > PY_SSIZE_T_MAX - incr) {
            PyErr_SetString(PyExc_OverflowError,
                            "encoded result is too long for a Python string");
            return NULL;
        }
        size += incr;
    }
    prealloc = prealloc_per_char * (collend - collstart);
    if (size > prealloc) {
        str = _PyBytesWriter_Prepare(writer, str, size - prealloc);
        if (str == NULL)
            return NULL;
    }
    /* generate replacement */
    for (i = collstart; i < collend; ++i) {
        ch = PyUnicode_READ(kind, data, i);
        if (ch < 0x100)
            str += sprintf(str, "\\x%02x", ch);
        else if (ch < 0x10000)
            str += sprintf(str, "\\u%04x", ch);
        else {
            assert(ch <= MAX_UNICODE);
            str += sprintf(str, "\\U%08x", ch);
        }
    }
    return str;
 }
 /* Implementation of the "xmlcharrefreplace" error handler for 8-bit encodings:
   ASCII, Latin1, UTF-8, etc. */
 static char*
 xmlcharrefreplace(_PyBytesWriter *writer, Py_ssize_t prealloc_per_char,
                  char *str,
                  PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
 {
    Py_ssize_t size, i, prealloc;
    Py_UCS4 ch;
    enum PyUnicode_Kind kind;
    void *data;
    assert(PyUnicode_IS_READY(unicode));
    kind = PyUnicode_KIND(unicode);
    data = PyUnicode_DATA(unicode);
    size = 0;
    /* determine replacement size */
    for (i = collstart; i < collend; ++i) {
        Py_ssize_t incr;
        ch = PyUnicode_READ(kind, data, i);
        if (ch < 10)
            incr = 2+1+1;
        else if (ch < 100)
            incr = 2+2+1;
        else if (ch < 1000)
            incr = 2+3+1;
        else if (ch < 10000)
            incr = 2+4+1;
        else if (ch < 100000)
            incr = 2+5+1;
        else if (ch < 1000000)
            incr = 2+6+1;
        else {
            assert(ch <= MAX_UNICODE);
            incr = 2+7+1;
        }
        if (size > PY_SSIZE_T_MAX - incr) {
            PyErr_SetString(PyExc_OverflowError,
                            "encoded result is too long for a Python string");
            return NULL;
        }
        size += incr;
    }
    prealloc = prealloc_per_char * (collend - collstart);
    if (size > prealloc) {
        str = _PyBytesWriter_Prepare(writer, str, size - prealloc);
        if (str == NULL)
            return NULL;
    }
    /* generate replacement */
    for (i = collstart; i < collend; ++i) {
        str += sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i));
    }
    return str;
 }
 /* --- Bloom Filters ----------------------------------------------------- */
 /* stuff to implement simple "bloom filters" for Unicode characters.
@ -6713,7 +6834,6 @@ unicode_encode_ucs1(PyObject *unicode,
            ++pos;
        }
        else {
            Py_ssize_t requiredsize;
            PyObject *repunicode;
            Py_ssize_t repsize, newpos, i;
            /* startpos for collecting unencodable chars */
@ -6744,42 +6864,19 @@ unicode_encode_ucs1(PyObject *unicode,
                pos = collend;
                break;
-            case _Py_ERROR_XMLCHARREFREPLACE:
+            case _Py_ERROR_BACKSLASHREPLACE:
-                requiredsize = 0;
+                str = backslashreplace(&writer, 1, str,
-                /* determine replacement size */
+                                       unicode, collstart, collend);
                for (i = collstart; i < collend; ++i) {
                    Py_ssize_t incr;
                    ch = PyUnicode_READ(kind, data, i);
                    if (ch < 10)
                        incr = 2+1+1;
                    else if (ch < 100)
                        incr = 2+2+1;
                    else if (ch < 1000)
                        incr = 2+3+1;
                    else if (ch < 10000)
                        incr = 2+4+1;
                    else if (ch < 100000)
                        incr = 2+5+1;
                    else if (ch < 1000000)
                        incr = 2+6+1;
                    else {
                        assert(ch <= MAX_UNICODE);
                        incr = 2+7+1;
                    }
                    if (requiredsize > PY_SSIZE_T_MAX - incr)
                        goto overflow;
                    requiredsize += incr;
                }
                str = _PyBytesWriter_Prepare(&writer, str, requiredsize-1);
                if (str == NULL)
                    goto onError;
                pos = collend;
                break;
-                /* generate replacement */
+            case _Py_ERROR_XMLCHARREFREPLACE:
-                for (i = collstart; i < collend; ++i) {
+                str = xmlcharrefreplace(&writer, 1, str,
-                    str += sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i));
+                                        unicode, collstart, collend);
-                }
+                if (str == NULL)
                    goto onError;
                pos = collend;
                break;
@ -6810,9 +6907,11 @@ unicode_encode_ucs1(PyObject *unicode,
                if (PyBytes_Check(repunicode)) {
                    /* Directly copy bytes result to output. */
                    repsize = PyBytes_Size(repunicode);
-                    str = _PyBytesWriter_Prepare(&writer, str, repsize-1);
+                    if (repsize > 1) {
-                    if (str == NULL)
+                        str = _PyBytesWriter_Prepare(&writer, str, repsize-1);
-                        goto onError;
+                        if (str == NULL)
                            goto onError;
                    }
                    memcpy(str, PyBytes_AsString(repunicode), repsize);
                    str += repsize;
                    pos = newpos;
@ -6856,10 +6955,6 @@ unicode_encode_ucs1(PyObject *unicode,
    Py_XDECREF(exc);
    return _PyBytesWriter_Finish(&writer, str);
  overflow:
    PyErr_SetString(PyExc_OverflowError,
                    "encoded result is too long for a Python string");
  onError:
    _PyBytesWriter_Dealloc(&writer);
    Py_XDECREF(error_handler_obj);