Optimize _PyUnicode_FastCopyCharacters() when maxchar(from) > maxchar(to)

2025-07-24 11:44:31 +00:00 · 2012-06-16 02:22:37 +02:00 · 2012-06-16 02:22:37 +02:00 · c9d369f1bf
commit c9d369f1bf
parent f05e17ece9
2 changed files with 79 additions and 57 deletions
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@ -1128,7 +1128,6 @@ _copy_characters(PyObject *to, Py_ssize_t to_start,
 {
    unsigned int from_kind, to_kind;
    void *from_data, *to_data;
    int fast;
    assert(0 <= how_many);
    assert(0 <= from_start);
@ -1137,41 +1136,40 @@ _copy_characters(PyObject *to, Py_ssize_t to_start,
    assert(PyUnicode_IS_READY(from));
    assert(from_start + how_many <= PyUnicode_GET_LENGTH(from));
    if (how_many == 0)
        return 0;
    assert(PyUnicode_Check(to));
    assert(PyUnicode_IS_READY(to));
    assert(to_start + how_many <= PyUnicode_GET_LENGTH(to));
    if (how_many == 0)
        return 0;
    from_kind = PyUnicode_KIND(from);
    from_data = PyUnicode_DATA(from);
    to_kind = PyUnicode_KIND(to);
    to_data = PyUnicode_DATA(to);
-#ifdef Py_DEBUG
+    if (from_kind == to_kind) {
-    if (!check_maxchar
+        if (!PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to)) {
-        && (from_kind > to_kind
+            /* Writing Latin-1 characters into an ASCII string requires to
-            || (!PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))))
+               check that all written characters are pure ASCII */
-    {
+#ifndef Py_DEBUG
-        const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
+            if (check_maxchar) {
-        Py_UCS4 ch;
+                Py_UCS4 max_char;
-        Py_ssize_t i;
+                max_char = ucs1lib_find_max_char(from_data,
-        for (i=0; i < how_many; i++) {
+                                                 (char*)from_data + how_many);
-            ch = PyUnicode_READ(from_kind, from_data, from_start + i);
+                if (max_char >= 128)
-            assert(ch <= to_maxchar);
+                    return -1;
-        }
+            }
-    }
+#else
            const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
            Py_UCS4 ch;
            Py_ssize_t i;
            for (i=0; i < how_many; i++) {
                ch = PyUnicode_READ(from_kind, from_data, from_start + i);
                assert(ch <= to_maxchar);
            }
 #endif
-    fast = (from_kind == to_kind);
+        }
    if (check_maxchar
        && (!PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to)))
    {
        /* deny latin1 => ascii */
        fast = 0;
    }
    if (fast) {
        Py_MEMCPY((char*)to_data + to_kind * to_start,
                  (char*)from_data + from_kind * from_start,
                  to_kind * how_many);
@ -1207,42 +1205,62 @@ _copy_characters(PyObject *to, Py_ssize_t to_start,
            );
    }
    else {
-        /* check if max_char(from substring) <= max_char(to) */
+        assert (PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to));
-        if (from_kind > to_kind
+
-                /* latin1 => ascii */
+#ifndef Py_DEBUG
-            || (!PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to)))
+        if (!check_maxchar) {
            if (from_kind == PyUnicode_2BYTE_KIND
                && to_kind == PyUnicode_1BYTE_KIND)
            {
                _PyUnicode_CONVERT_BYTES(
                    Py_UCS2, Py_UCS1,
                    PyUnicode_2BYTE_DATA(from) + from_start,
                    PyUnicode_2BYTE_DATA(from) + from_start + how_many,
                    PyUnicode_1BYTE_DATA(to) + to_start
                    );
            }
            else if (from_kind == PyUnicode_4BYTE_KIND
                     && to_kind == PyUnicode_1BYTE_KIND)
            {
                _PyUnicode_CONVERT_BYTES(
                    Py_UCS4, Py_UCS1,
                    PyUnicode_4BYTE_DATA(from) + from_start,
                    PyUnicode_4BYTE_DATA(from) + from_start + how_many,
                    PyUnicode_1BYTE_DATA(to) + to_start
                    );
            }
            else if (from_kind == PyUnicode_4BYTE_KIND
                     && to_kind == PyUnicode_2BYTE_KIND)
            {
                _PyUnicode_CONVERT_BYTES(
                    Py_UCS4, Py_UCS2,
                    PyUnicode_4BYTE_DATA(from) + from_start,
                    PyUnicode_4BYTE_DATA(from) + from_start + how_many,
                    PyUnicode_2BYTE_DATA(to) + to_start
                    );
            }
            else {
                assert(0);
                return -1;
            }
        }
        else
 #endif
        {
            /* slow path to check for character overflow */
            const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
            Py_UCS4 ch;
            Py_ssize_t i;
 #ifdef Py_DEBUG
            for (i=0; i < how_many; i++) {
                ch = PyUnicode_READ(from_kind, from_data, from_start + i);
 #ifndef Py_DEBUG
                assert(ch <= to_maxchar);
 #else
                if (ch > to_maxchar)
                    return -1;
 #endif
                PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
            }
 #else
            if (!check_maxchar) {
                for (i=0; i < how_many; i++) {
                    ch = PyUnicode_READ(from_kind, from_data, from_start + i);
                    PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
                }
            }
            else {
                for (i=0; i < how_many; i++) {
                    ch = PyUnicode_READ(from_kind, from_data, from_start + i);
                    if (ch > to_maxchar)
                        return 1;
                    PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
                }
            }
 #endif
        }
        else {
            assert(0 && "inconsistent state");
            return 1;
        }
    }
    return 0;
@ -13876,9 +13894,11 @@ PyUnicode_Format(PyObject *format, PyObject *args)
                }
            }
-            _PyUnicode_FastCopyCharacters(writer.buffer, writer.pos,
+            if (len) {
-                                          temp, pindex, len);
+                _PyUnicode_FastCopyCharacters(writer.buffer, writer.pos,
-            writer.pos += len;
+                                              temp, pindex, len);
                writer.pos += len;
            }
            if (width > len) {
                sublen = width - len;
                FILL(writer.kind, writer.data, ' ', writer.pos, sublen);
--- a/Python/formatter_unicode.c
+++ b/Python/formatter_unicode.c
@ -786,8 +786,10 @@ format_string_internal(PyObject *value, const InternalFormatSpec *format,
        goto done;
    /* Then the source string. */
-    _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
+    if (len) {
-                                  value, 0, len);
+        _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
                                      value, 0, len);
    }
    writer->pos += (len + rpad);
    result = 0;