Merged revisions 77461 via svnmerge from

svn+ssh://pythondev@svn.python.org/python/trunk ........ r77461 | antoine.pitrou | 2010-01-13 08:55:48 +0100 (mer., 13 janv. 2010) | 5 lines Issue #7622: Improve the split(), rsplit(), splitlines() and replace() methods of bytes, bytearray and unicode objects by using a common implementation based on stringlib's fast search. Patch by Florent Xicluna. ........
2025-10-13 18:33:34 +00:00 · 2010-01-13 08:07:53 +00:00 · 2010-01-13 08:07:53 +00:00 · f2c5484f9e
commit f2c5484f9e
parent 5efea0430c
16 changed files with 1123 additions and 1418 deletions
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@ -210,7 +210,8 @@ PyUnicode_GetMax(void)

 static BLOOM_MASK bloom_linebreak;

-#define BLOOM(mask, ch) ((mask & (1 << ((ch) & 0x1F))))
+#define BLOOM_ADD(mask, ch) ((mask |= (1 << ((ch) & (LONG_BIT - 1)))))
+#define BLOOM(mask, ch)     ((mask &  (1 << ((ch) & (LONG_BIT - 1)))))

 #define BLOOM_LINEBREAK(ch)                                             \
    ((ch) < 128U ? ascii_linebreak[(ch)] :                              \
@ -225,7 +226,7 @@ Py_LOCAL_INLINE(BLOOM_MASK) make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len)

    mask = 0;
    for (i = 0; i < len; i++)
-        mask |= (1 << (ptr[i] & 0x1F));
+        BLOOM_ADD(mask, ptr[i]);

    return mask;
 }
@ -5873,28 +5874,30 @@ int PyUnicode_EncodeDecimal(Py_UNICODE *s,

 #include "stringlib/unicodedefs.h"
 #include "stringlib/fastsearch.h"
+
 #include "stringlib/count.h"
-/* Include _ParseTupleFinds from find.h */
-#define FROM_UNICODE
 #include "stringlib/find.h"
 #include "stringlib/partition.h"
+#include "stringlib/split.h"

 #define _Py_InsertThousandsGrouping _PyUnicode_InsertThousandsGrouping
 #define _Py_InsertThousandsGroupingLocale _PyUnicode_InsertThousandsGroupingLocale
 #include "stringlib/localeutil.h"

 /* helper macro to fixup start/end slice values */
-#define FIX_START_END(obj)                      \
-    if (start < 0)                              \
-        start += (obj)->length;                 \
-    if (start < 0)                              \
-        start = 0;                              \
-    if (end > (obj)->length)                    \
-        end = (obj)->length;                    \
-    if (end < 0)                                \
-        end += (obj)->length;                   \
-    if (end < 0)                                \
-        end = 0;
+#define ADJUST_INDICES(start, end, len)         \
+    if (end > len)                              \
+        end = len;                              \
+    else if (end < 0) {                         \
+        end += len;                             \
+        if (end < 0)                            \
+            end = 0;                            \
+    }                                           \
+    if (start < 0) {                            \
+        start += len;                           \
+        if (start < 0)                          \
+            start = 0;                          \
+    }

 Py_ssize_t PyUnicode_Count(PyObject *str,
                           PyObject *substr,
@ -5914,10 +5917,10 @@ Py_ssize_t PyUnicode_Count(PyObject *str,
        return -1;
    }

-    FIX_START_END(str_obj);
-
+    ADJUST_INDICES(start, end, str_obj->length);
    result = stringlib_count(
-        str_obj->str + start, end - start, sub_obj->str, sub_obj->length
+        str_obj->str + start, end - start, sub_obj->str, sub_obj->length,
+        PY_SSIZE_T_MAX
        );

    Py_DECREF(sub_obj);
@ -5972,8 +5975,7 @@ int tailmatch(PyUnicodeObject *self,
    if (substring->length == 0)
        return 1;

-    FIX_START_END(self);
-
+    ADJUST_INDICES(start, end, self->length);
    end -= substring->length;
    if (end < start)
        return 0;
@ -6314,305 +6316,40 @@ PyUnicodeObject *pad(PyUnicodeObject *self,
    return u;
 }

-#define SPLIT_APPEND(data, left, right)                                 \
-    str = PyUnicode_FromUnicode((data) + (left), (right) - (left));     \
-    if (!str)                                                           \
-        goto onError;                                                   \
-    if (PyList_Append(list, str)) {                                     \
-        Py_DECREF(str);                                                 \
-        goto onError;                                                   \
-    }                                                                   \
-    else                                                                \
-        Py_DECREF(str);
-
-static
-PyObject *split_whitespace(PyUnicodeObject *self,
-                           PyObject *list,
-                           Py_ssize_t maxcount)
+PyObject *PyUnicode_Splitlines(PyObject *string, int keepends)
 {
-    register Py_ssize_t i;
-    register Py_ssize_t j;
-    Py_ssize_t len = self->length;
-    PyObject *str;
-    register const Py_UNICODE *buf = self->str;
-
-    for (i = j = 0; i < len; ) {
-        /* find a token */
-        while (i < len && Py_UNICODE_ISSPACE(buf[i]))
-            i++;
-        j = i;
-        while (i < len && !Py_UNICODE_ISSPACE(buf[i]))
-            i++;
-        if (j < i) {
-            if (maxcount-- <= 0)
-                break;
-            SPLIT_APPEND(buf, j, i);
-            while (i < len && Py_UNICODE_ISSPACE(buf[i]))
-                i++;
-            j = i;
-        }
-    }
-    if (j < len) {
-        SPLIT_APPEND(buf, j, len);
-    }
-    return list;
-
-  onError:
-    Py_DECREF(list);
-    return NULL;
-}
-
-PyObject *PyUnicode_Splitlines(PyObject *string,
-                               int keepends)
-{
-    register Py_ssize_t i;
-    register Py_ssize_t j;
-    Py_ssize_t len;
    PyObject *list;
-    PyObject *str;
-    Py_UNICODE *data;

    string = PyUnicode_FromObject(string);
    if (string == NULL)
        return NULL;
-    data = PyUnicode_AS_UNICODE(string);
-    len = PyUnicode_GET_SIZE(string);

-    list = PyList_New(0);
-    if (!list)
-        goto onError;
-
-    for (i = j = 0; i < len; ) {
-        Py_ssize_t eol;
-
-        /* Find a line and append it */
-        while (i < len && !BLOOM_LINEBREAK(data[i]))
-            i++;
-
-        /* Skip the line break reading CRLF as one line break */
-        eol = i;
-        if (i < len) {
-            if (data[i] == '\r' && i + 1 < len &&
-                data[i+1] == '\n')
-                i += 2;
-            else
-                i++;
-            if (keepends)
-                eol = i;
-        }
-        SPLIT_APPEND(data, j, eol);
-        j = i;
-    }
-    if (j < len) {
-        SPLIT_APPEND(data, j, len);
-    }
+    list = stringlib_splitlines(
+        (PyObject*) string, PyUnicode_AS_UNICODE(string),
+        PyUnicode_GET_SIZE(string), keepends);

    Py_DECREF(string);
    return list;
-
-  onError:
-    Py_XDECREF(list);
-    Py_DECREF(string);
-    return NULL;
 }

-static
-PyObject *split_char(PyUnicodeObject *self,
-                     PyObject *list,
-                     Py_UNICODE ch,
-                     Py_ssize_t maxcount)
-{
-    register Py_ssize_t i;
-    register Py_ssize_t j;
-    Py_ssize_t len = self->length;
-    PyObject *str;
-    register const Py_UNICODE *buf = self->str;
-
-    for (i = j = 0; i < len; ) {
-        if (buf[i] == ch) {
-            if (maxcount-- <= 0)
-                break;
-            SPLIT_APPEND(buf, j, i);
-            i = j = i + 1;
-        } else
-            i++;
-    }
-    if (j <= len) {
-        SPLIT_APPEND(buf, j, len);
-    }
-    return list;
-
-  onError:
-    Py_DECREF(list);
-    return NULL;
-}
-
-static
-PyObject *split_substring(PyUnicodeObject *self,
-                          PyObject *list,
-                          PyUnicodeObject *substring,
-                          Py_ssize_t maxcount)
-{
-    register Py_ssize_t i;
-    register Py_ssize_t j;
-    Py_ssize_t len = self->length;
-    Py_ssize_t sublen = substring->length;
-    PyObject *str;
-
-    for (i = j = 0; i <= len - sublen; ) {
-        if (Py_UNICODE_MATCH(self, i, substring)) {
-            if (maxcount-- <= 0)
-                break;
-            SPLIT_APPEND(self->str, j, i);
-            i = j = i + sublen;
-        } else
-            i++;
-    }
-    if (j <= len) {
-        SPLIT_APPEND(self->str, j, len);
-    }
-    return list;
-
-  onError:
-    Py_DECREF(list);
-    return NULL;
-}
-
-static
-PyObject *rsplit_whitespace(PyUnicodeObject *self,
-                            PyObject *list,
-                            Py_ssize_t maxcount)
-{
-    register Py_ssize_t i;
-    register Py_ssize_t j;
-    Py_ssize_t len = self->length;
-    PyObject *str;
-    register const Py_UNICODE *buf = self->str;
-
-    for (i = j = len - 1; i >= 0; ) {
-        /* find a token */
-        while (i >= 0 && Py_UNICODE_ISSPACE(buf[i]))
-            i--;
-        j = i;
-        while (i >= 0 && !Py_UNICODE_ISSPACE(buf[i]))
-            i--;
-        if (j > i) {
-            if (maxcount-- <= 0)
-                break;
-            SPLIT_APPEND(buf, i + 1, j + 1);
-            while (i >= 0 && Py_UNICODE_ISSPACE(buf[i]))
-                i--;
-            j = i;
-        }
-    }
-    if (j >= 0) {
-        SPLIT_APPEND(buf, 0, j + 1);
-    }
-    if (PyList_Reverse(list) < 0)
-        goto onError;
-    return list;
-
-  onError:
-    Py_DECREF(list);
-    return NULL;
-}
-
-static
-PyObject *rsplit_char(PyUnicodeObject *self,
-                      PyObject *list,
-                      Py_UNICODE ch,
-                      Py_ssize_t maxcount)
-{
-    register Py_ssize_t i;
-    register Py_ssize_t j;
-    Py_ssize_t len = self->length;
-    PyObject *str;
-    register const Py_UNICODE *buf = self->str;
-
-    for (i = j = len - 1; i >= 0; ) {
-        if (buf[i] == ch) {
-            if (maxcount-- <= 0)
-                break;
-            SPLIT_APPEND(buf, i + 1, j + 1);
-            j = i = i - 1;
-        } else
-            i--;
-    }
-    if (j >= -1) {
-        SPLIT_APPEND(buf, 0, j + 1);
-    }
-    if (PyList_Reverse(list) < 0)
-        goto onError;
-    return list;
-
-  onError:
-    Py_DECREF(list);
-    return NULL;
-}
-
-static
-PyObject *rsplit_substring(PyUnicodeObject *self,
-                           PyObject *list,
-                           PyUnicodeObject *substring,
-                           Py_ssize_t maxcount)
-{
-    register Py_ssize_t i;
-    register Py_ssize_t j;
-    Py_ssize_t len = self->length;
-    Py_ssize_t sublen = substring->length;
-    PyObject *str;
-
-    for (i = len - sublen, j = len; i >= 0; ) {
-        if (Py_UNICODE_MATCH(self, i, substring)) {
-            if (maxcount-- <= 0)
-                break;
-            SPLIT_APPEND(self->str, i + sublen, j);
-            j = i;
-            i -= sublen;
-        } else
-            i--;
-    }
-    if (j >= 0) {
-        SPLIT_APPEND(self->str, 0, j);
-    }
-    if (PyList_Reverse(list) < 0)
-        goto onError;
-    return list;
-
-  onError:
-    Py_DECREF(list);
-    return NULL;
-}
-
-#undef SPLIT_APPEND
-
 static
 PyObject *split(PyUnicodeObject *self,
                PyUnicodeObject *substring,
                Py_ssize_t maxcount)
 {
-    PyObject *list;
-
    if (maxcount < 0)
        maxcount = PY_SSIZE_T_MAX;

-    list = PyList_New(0);
-    if (!list)
-        return NULL;
-
    if (substring == NULL)
-        return split_whitespace(self,list,maxcount);
+        return stringlib_split_whitespace(
+            (PyObject*) self,  self->str, self->length, maxcount
+            );

-    else if (substring->length == 1)
-        return split_char(self,list,substring->str[0],maxcount);
-
-    else if (substring->length == 0) {
-        Py_DECREF(list);
-        PyErr_SetString(PyExc_ValueError, "empty separator");
-        return NULL;
-    }
-    else
-        return split_substring(self,list,substring,maxcount);
+    return stringlib_split(
+        (PyObject*) self,  self->str, self->length,
+        substring->str, substring->length,
+        maxcount
+        );
 }

 static
@ -6620,28 +6357,19 @@ PyObject *rsplit(PyUnicodeObject *self,
                 PyUnicodeObject *substring,
                 Py_ssize_t maxcount)
 {
-    PyObject *list;
-
    if (maxcount < 0)
        maxcount = PY_SSIZE_T_MAX;

-    list = PyList_New(0);
-    if (!list)
-        return NULL;
-
    if (substring == NULL)
-        return rsplit_whitespace(self,list,maxcount);
+        return stringlib_rsplit_whitespace(
+            (PyObject*) self,  self->str, self->length, maxcount
+            );

-    else if (substring->length == 1)
-        return rsplit_char(self,list,substring->str[0],maxcount);
-
-    else if (substring->length == 0) {
-        Py_DECREF(list);
-        PyErr_SetString(PyExc_ValueError, "empty separator");
-        return NULL;
-    }
-    else
-        return rsplit_substring(self,list,substring,maxcount);
+    return stringlib_rsplit(
+        (PyObject*) self,  self->str, self->length,
+        substring->str, substring->length,
+        maxcount
+        );
 }

 static
@ -6654,9 +6382,13 @@ PyObject *replace(PyUnicodeObject *self,

    if (maxcount < 0)
        maxcount = PY_SSIZE_T_MAX;
+    else if (maxcount == 0 || self->length == 0)
+        goto nothing;

    if (str1->length == str2->length) {
        /* same length */
+        if (str1->length == 0)
+            goto nothing;
        Py_ssize_t i;
        if (str1->length == 1) {
            /* replace characters */
@ -6676,8 +6408,8 @@ PyObject *replace(PyUnicodeObject *self,
                    u->str[i] = u2;
                }
        } else {
-            i = fastsearch(
-                self->str, self->length, str1->str, str1->length, FAST_SEARCH
+            i = stringlib_find(
+                self->str, self->length, str1->str, str1->length, 0
                );
            if (i < 0)
                goto nothing;
@ -6685,14 +6417,20 @@ PyObject *replace(PyUnicodeObject *self,
            if (!u)
                return NULL;
            Py_UNICODE_COPY(u->str, self->str, self->length);
-            while (i <= self->length - str1->length)
-                if (Py_UNICODE_MATCH(self, i, str1)) {
-                    if (--maxcount < 0)
-                        break;
-                    Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
-                    i += str1->length;
-                } else
-                    i++;
+
+            /* change everything in-place, starting with this one */
+            Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
+            i += str1->length;
+
+            while ( --maxcount > 0) {
+                i = stringlib_find(self->str+i, self->length-i,
+                                   str1->str, str1->length,
+                                   i);
+                if (i == -1)
+                    break;
+                Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
+                i += str1->length;
+            }
        }
    } else {

@ -6701,9 +6439,8 @@ PyObject *replace(PyUnicodeObject *self,
        Py_UNICODE *p;

        /* replace strings */
-        n = stringlib_count(self->str, self->length, str1->str, str1->length);
-        if (n > maxcount)
-            n = maxcount;
+        n = stringlib_count(self->str, self->length, str1->str, str1->length,
+                            maxcount);
        if (n == 0)
            goto nothing;
        /* new_size = self->length + n * (str2->length - str1->length)); */
@ -6733,15 +6470,12 @@ PyObject *replace(PyUnicodeObject *self,
        if (str1->length > 0) {
            while (n-- > 0) {
                /* look for next match */
-                j = i;
-                while (j <= e) {
-                    if (Py_UNICODE_MATCH(self, j, str1))
-                        break;
-                    j++;
-                }
-                if (j > i) {
-                    if (j > e)
-                        break;
+                j = stringlib_find(self->str+i, self->length-i,
+                                   str1->str, str1->length,
+                                   i);
+                if (j == -1)
+                    break;
+                else if (j > i) {
                    /* copy unchanged part [i:j] */
                    Py_UNICODE_COPY(p, self->str+i, j-i);
                    p += j - i;
@ -7192,11 +6926,11 @@ unicode_count(PyUnicodeObject *self, PyObject *args)
    if (substring == NULL)
        return NULL;

-    FIX_START_END(self);
-
+    ADJUST_INDICES(start, end, self->length);
    result = PyLong_FromSsize_t(
        stringlib_count(self->str + start, end - start,
-                        substring->str, substring->length)
+                        substring->str, substring->length,
+                        PY_SSIZE_T_MAX)
        );

    Py_DECREF(substring);
@ -10066,11 +9800,3 @@ Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
 #ifdef __cplusplus
 }
 #endif
-
-
-/*
-  Local variables:
-  c-basic-offset: 4
-  indent-tabs-mode: nil
-  End:
-*/