mirror of
				https://github.com/python/cpython.git
				synced 2025-10-26 16:27:06 +00:00 
			
		
		
		
	
		
			
				
	
	
		
			390 lines
		
	
	
	
		
			11 KiB
		
	
	
	
		
			C
		
	
	
	
	
	
			
		
		
	
	
			390 lines
		
	
	
	
		
			11 KiB
		
	
	
	
		
			C
		
	
	
	
	
	
| /* stringlib: split implementation */
 | |
| 
 | |
| #ifndef STRINGLIB_FASTSEARCH_H
 | |
| #error must include "stringlib/fastsearch.h" before including this module
 | |
| #endif
 | |
| 
 | |
| /* Overallocate the initial list to reduce the number of reallocs for small
 | |
|    split sizes.  Eg, "A A A A A A A A A A".split() (10 elements) has three
 | |
|    resizes, to sizes 4, 8, then 16.  Most observed string splits are for human
 | |
|    text (roughly 11 words per line) and field delimited data (usually 1-10
 | |
|    fields).  For large strings the split algorithms are bandwidth limited
 | |
|    so increasing the preallocation likely will not improve things.*/
 | |
| 
 | |
| #define MAX_PREALLOC 12
 | |
| 
 | |
| /* 5 splits gives 6 elements */
 | |
| #define PREALLOC_SIZE(maxsplit) \
 | |
|     (maxsplit >= MAX_PREALLOC ? MAX_PREALLOC : maxsplit+1)
 | |
| 
 | |
| #define SPLIT_APPEND(data, left, right)         \
 | |
|     sub = STRINGLIB_NEW((data) + (left),        \
 | |
|                         (right) - (left));      \
 | |
|     if (sub == NULL)                            \
 | |
|         goto onError;                           \
 | |
|     if (PyList_Append(list, sub)) {             \
 | |
|         Py_DECREF(sub);                         \
 | |
|         goto onError;                           \
 | |
|     }                                           \
 | |
|     else                                        \
 | |
|         Py_DECREF(sub);
 | |
| 
 | |
| #define SPLIT_ADD(data, left, right) {          \
 | |
|     sub = STRINGLIB_NEW((data) + (left),        \
 | |
|                         (right) - (left));      \
 | |
|     if (sub == NULL)                            \
 | |
|         goto onError;                           \
 | |
|     if (count < MAX_PREALLOC) {                 \
 | |
|         PyList_SET_ITEM(list, count, sub);      \
 | |
|     } else {                                    \
 | |
|         if (PyList_Append(list, sub)) {         \
 | |
|             Py_DECREF(sub);                     \
 | |
|             goto onError;                       \
 | |
|         }                                       \
 | |
|         else                                    \
 | |
|             Py_DECREF(sub);                     \
 | |
|     }                                           \
 | |
|     count++; }
 | |
| 
 | |
| 
 | |
| /* Always force the list to the expected size. */
 | |
| #define FIX_PREALLOC_SIZE(list) Py_SIZE(list) = count
 | |
| 
 | |
| Py_LOCAL_INLINE(PyObject *)
 | |
| STRINGLIB(split_whitespace)(PyObject* str_obj,
 | |
|                            const STRINGLIB_CHAR* str, Py_ssize_t str_len,
 | |
|                            Py_ssize_t maxcount)
 | |
| {
 | |
|     Py_ssize_t i, j, count=0;
 | |
|     PyObject *list = PyList_New(PREALLOC_SIZE(maxcount));
 | |
|     PyObject *sub;
 | |
| 
 | |
|     if (list == NULL)
 | |
|         return NULL;
 | |
| 
 | |
|     i = j = 0;
 | |
|     while (maxcount-- > 0) {
 | |
|         while (i < str_len && STRINGLIB_ISSPACE(str[i]))
 | |
|             i++;
 | |
|         if (i == str_len) break;
 | |
|         j = i; i++;
 | |
|         while (i < str_len && !STRINGLIB_ISSPACE(str[i]))
 | |
|             i++;
 | |
| #ifndef STRINGLIB_MUTABLE
 | |
|         if (j == 0 && i == str_len && STRINGLIB_CHECK_EXACT(str_obj)) {
 | |
|             /* No whitespace in str_obj, so just use it as list[0] */
 | |
|             Py_INCREF(str_obj);
 | |
|             PyList_SET_ITEM(list, 0, (PyObject *)str_obj);
 | |
|             count++;
 | |
|             break;
 | |
|         }
 | |
| #endif
 | |
|         SPLIT_ADD(str, j, i);
 | |
|     }
 | |
| 
 | |
|     if (i < str_len) {
 | |
|         /* Only occurs when maxcount was reached */
 | |
|         /* Skip any remaining whitespace and copy to end of string */
 | |
|         while (i < str_len && STRINGLIB_ISSPACE(str[i]))
 | |
|             i++;
 | |
|         if (i != str_len)
 | |
|             SPLIT_ADD(str, i, str_len);
 | |
|     }
 | |
|     FIX_PREALLOC_SIZE(list);
 | |
|     return list;
 | |
| 
 | |
|   onError:
 | |
|     Py_DECREF(list);
 | |
|     return NULL;
 | |
| }
 | |
| 
 | |
| Py_LOCAL_INLINE(PyObject *)
 | |
| STRINGLIB(split_char)(PyObject* str_obj,
 | |
|                      const STRINGLIB_CHAR* str, Py_ssize_t str_len,
 | |
|                      const STRINGLIB_CHAR ch,
 | |
|                      Py_ssize_t maxcount)
 | |
| {
 | |
|     Py_ssize_t i, j, count=0;
 | |
|     PyObject *list = PyList_New(PREALLOC_SIZE(maxcount));
 | |
|     PyObject *sub;
 | |
| 
 | |
|     if (list == NULL)
 | |
|         return NULL;
 | |
| 
 | |
|     i = j = 0;
 | |
|     while ((j < str_len) && (maxcount-- > 0)) {
 | |
|         for(; j < str_len; j++) {
 | |
|             /* I found that using memchr makes no difference */
 | |
|             if (str[j] == ch) {
 | |
|                 SPLIT_ADD(str, i, j);
 | |
|                 i = j = j + 1;
 | |
|                 break;
 | |
|             }
 | |
|         }
 | |
|     }
 | |
| #ifndef STRINGLIB_MUTABLE
 | |
|     if (count == 0 && STRINGLIB_CHECK_EXACT(str_obj)) {
 | |
|         /* ch not in str_obj, so just use str_obj as list[0] */
 | |
|         Py_INCREF(str_obj);
 | |
|         PyList_SET_ITEM(list, 0, (PyObject *)str_obj);
 | |
|         count++;
 | |
|     } else
 | |
| #endif
 | |
|     if (i <= str_len) {
 | |
|         SPLIT_ADD(str, i, str_len);
 | |
|     }
 | |
|     FIX_PREALLOC_SIZE(list);
 | |
|     return list;
 | |
| 
 | |
|   onError:
 | |
|     Py_DECREF(list);
 | |
|     return NULL;
 | |
| }
 | |
| 
 | |
| Py_LOCAL_INLINE(PyObject *)
 | |
| STRINGLIB(split)(PyObject* str_obj,
 | |
|                 const STRINGLIB_CHAR* str, Py_ssize_t str_len,
 | |
|                 const STRINGLIB_CHAR* sep, Py_ssize_t sep_len,
 | |
|                 Py_ssize_t maxcount)
 | |
| {
 | |
|     Py_ssize_t i, j, pos, count=0;
 | |
|     PyObject *list, *sub;
 | |
| 
 | |
|     if (sep_len == 0) {
 | |
|         PyErr_SetString(PyExc_ValueError, "empty separator");
 | |
|         return NULL;
 | |
|     }
 | |
|     else if (sep_len == 1)
 | |
|         return STRINGLIB(split_char)(str_obj, str, str_len, sep[0], maxcount);
 | |
| 
 | |
|     list = PyList_New(PREALLOC_SIZE(maxcount));
 | |
|     if (list == NULL)
 | |
|         return NULL;
 | |
| 
 | |
|     i = j = 0;
 | |
|     while (maxcount-- > 0) {
 | |
|         pos = FASTSEARCH(str+i, str_len-i, sep, sep_len, -1, FAST_SEARCH);
 | |
|         if (pos < 0)
 | |
|             break;
 | |
|         j = i + pos;
 | |
|         SPLIT_ADD(str, i, j);
 | |
|         i = j + sep_len;
 | |
|     }
 | |
| #ifndef STRINGLIB_MUTABLE
 | |
|     if (count == 0 && STRINGLIB_CHECK_EXACT(str_obj)) {
 | |
|         /* No match in str_obj, so just use it as list[0] */
 | |
|         Py_INCREF(str_obj);
 | |
|         PyList_SET_ITEM(list, 0, (PyObject *)str_obj);
 | |
|         count++;
 | |
|     } else
 | |
| #endif
 | |
|     {
 | |
|         SPLIT_ADD(str, i, str_len);
 | |
|     }
 | |
|     FIX_PREALLOC_SIZE(list);
 | |
|     return list;
 | |
| 
 | |
|   onError:
 | |
|     Py_DECREF(list);
 | |
|     return NULL;
 | |
| }
 | |
| 
 | |
| Py_LOCAL_INLINE(PyObject *)
 | |
| STRINGLIB(rsplit_whitespace)(PyObject* str_obj,
 | |
|                             const STRINGLIB_CHAR* str, Py_ssize_t str_len,
 | |
|                             Py_ssize_t maxcount)
 | |
| {
 | |
|     Py_ssize_t i, j, count=0;
 | |
|     PyObject *list = PyList_New(PREALLOC_SIZE(maxcount));
 | |
|     PyObject *sub;
 | |
| 
 | |
|     if (list == NULL)
 | |
|         return NULL;
 | |
| 
 | |
|     i = j = str_len - 1;
 | |
|     while (maxcount-- > 0) {
 | |
|         while (i >= 0 && STRINGLIB_ISSPACE(str[i]))
 | |
|             i--;
 | |
|         if (i < 0) break;
 | |
|         j = i; i--;
 | |
|         while (i >= 0 && !STRINGLIB_ISSPACE(str[i]))
 | |
|             i--;
 | |
| #ifndef STRINGLIB_MUTABLE
 | |
|         if (j == str_len - 1 && i < 0 && STRINGLIB_CHECK_EXACT(str_obj)) {
 | |
|             /* No whitespace in str_obj, so just use it as list[0] */
 | |
|             Py_INCREF(str_obj);
 | |
|             PyList_SET_ITEM(list, 0, (PyObject *)str_obj);
 | |
|             count++;
 | |
|             break;
 | |
|         }
 | |
| #endif
 | |
|         SPLIT_ADD(str, i + 1, j + 1);
 | |
|     }
 | |
| 
 | |
|     if (i >= 0) {
 | |
|         /* Only occurs when maxcount was reached */
 | |
|         /* Skip any remaining whitespace and copy to beginning of string */
 | |
|         while (i >= 0 && STRINGLIB_ISSPACE(str[i]))
 | |
|             i--;
 | |
|         if (i >= 0)
 | |
|             SPLIT_ADD(str, 0, i + 1);
 | |
|     }
 | |
|     FIX_PREALLOC_SIZE(list);
 | |
|     if (PyList_Reverse(list) < 0)
 | |
|         goto onError;
 | |
|     return list;
 | |
| 
 | |
|   onError:
 | |
|     Py_DECREF(list);
 | |
|     return NULL;
 | |
| }
 | |
| 
 | |
| Py_LOCAL_INLINE(PyObject *)
 | |
| STRINGLIB(rsplit_char)(PyObject* str_obj,
 | |
|                       const STRINGLIB_CHAR* str, Py_ssize_t str_len,
 | |
|                       const STRINGLIB_CHAR ch,
 | |
|                       Py_ssize_t maxcount)
 | |
| {
 | |
|     Py_ssize_t i, j, count=0;
 | |
|     PyObject *list = PyList_New(PREALLOC_SIZE(maxcount));
 | |
|     PyObject *sub;
 | |
| 
 | |
|     if (list == NULL)
 | |
|         return NULL;
 | |
| 
 | |
|     i = j = str_len - 1;
 | |
|     while ((i >= 0) && (maxcount-- > 0)) {
 | |
|         for(; i >= 0; i--) {
 | |
|             if (str[i] == ch) {
 | |
|                 SPLIT_ADD(str, i + 1, j + 1);
 | |
|                 j = i = i - 1;
 | |
|                 break;
 | |
|             }
 | |
|         }
 | |
|     }
 | |
| #ifndef STRINGLIB_MUTABLE
 | |
|     if (count == 0 && STRINGLIB_CHECK_EXACT(str_obj)) {
 | |
|         /* ch not in str_obj, so just use str_obj as list[0] */
 | |
|         Py_INCREF(str_obj);
 | |
|         PyList_SET_ITEM(list, 0, (PyObject *)str_obj);
 | |
|         count++;
 | |
|     } else
 | |
| #endif
 | |
|     if (j >= -1) {
 | |
|         SPLIT_ADD(str, 0, j + 1);
 | |
|     }
 | |
|     FIX_PREALLOC_SIZE(list);
 | |
|     if (PyList_Reverse(list) < 0)
 | |
|         goto onError;
 | |
|     return list;
 | |
| 
 | |
|   onError:
 | |
|     Py_DECREF(list);
 | |
|     return NULL;
 | |
| }
 | |
| 
 | |
| Py_LOCAL_INLINE(PyObject *)
 | |
| STRINGLIB(rsplit)(PyObject* str_obj,
 | |
|                  const STRINGLIB_CHAR* str, Py_ssize_t str_len,
 | |
|                  const STRINGLIB_CHAR* sep, Py_ssize_t sep_len,
 | |
|                  Py_ssize_t maxcount)
 | |
| {
 | |
|     Py_ssize_t j, pos, count=0;
 | |
|     PyObject *list, *sub;
 | |
| 
 | |
|     if (sep_len == 0) {
 | |
|         PyErr_SetString(PyExc_ValueError, "empty separator");
 | |
|         return NULL;
 | |
|     }
 | |
|     else if (sep_len == 1)
 | |
|         return STRINGLIB(rsplit_char)(str_obj, str, str_len, sep[0], maxcount);
 | |
| 
 | |
|     list = PyList_New(PREALLOC_SIZE(maxcount));
 | |
|     if (list == NULL)
 | |
|         return NULL;
 | |
| 
 | |
|     j = str_len;
 | |
|     while (maxcount-- > 0) {
 | |
|         pos = FASTSEARCH(str, j, sep, sep_len, -1, FAST_RSEARCH);
 | |
|         if (pos < 0)
 | |
|             break;
 | |
|         SPLIT_ADD(str, pos + sep_len, j);
 | |
|         j = pos;
 | |
|     }
 | |
| #ifndef STRINGLIB_MUTABLE
 | |
|     if (count == 0 && STRINGLIB_CHECK_EXACT(str_obj)) {
 | |
|         /* No match in str_obj, so just use it as list[0] */
 | |
|         Py_INCREF(str_obj);
 | |
|         PyList_SET_ITEM(list, 0, (PyObject *)str_obj);
 | |
|         count++;
 | |
|     } else
 | |
| #endif
 | |
|     {
 | |
|         SPLIT_ADD(str, 0, j);
 | |
|     }
 | |
|     FIX_PREALLOC_SIZE(list);
 | |
|     if (PyList_Reverse(list) < 0)
 | |
|         goto onError;
 | |
|     return list;
 | |
| 
 | |
|   onError:
 | |
|     Py_DECREF(list);
 | |
|     return NULL;
 | |
| }
 | |
| 
 | |
| Py_LOCAL_INLINE(PyObject *)
 | |
| STRINGLIB(splitlines)(PyObject* str_obj,
 | |
|                      const STRINGLIB_CHAR* str, Py_ssize_t str_len,
 | |
|                      int keepends)
 | |
| {
 | |
|     /* This does not use the preallocated list because splitlines is
 | |
|        usually run with hundreds of newlines.  The overhead of
 | |
|        switching between PyList_SET_ITEM and append causes about a
 | |
|        2-3% slowdown for that common case.  A smarter implementation
 | |
|        could move the if check out, so the SET_ITEMs are done first
 | |
|        and the appends only done when the prealloc buffer is full.
 | |
|        That's too much work for little gain.*/
 | |
| 
 | |
|     Py_ssize_t i;
 | |
|     Py_ssize_t j;
 | |
|     PyObject *list = PyList_New(0);
 | |
|     PyObject *sub;
 | |
| 
 | |
|     if (list == NULL)
 | |
|         return NULL;
 | |
| 
 | |
|     for (i = j = 0; i < str_len; ) {
 | |
|         Py_ssize_t eol;
 | |
| 
 | |
|         /* Find a line and append it */
 | |
|         while (i < str_len && !STRINGLIB_ISLINEBREAK(str[i]))
 | |
|             i++;
 | |
| 
 | |
|         /* Skip the line break reading CRLF as one line break */
 | |
|         eol = i;
 | |
|         if (i < str_len) {
 | |
|             if (str[i] == '\r' && i + 1 < str_len && str[i+1] == '\n')
 | |
|                 i += 2;
 | |
|             else
 | |
|                 i++;
 | |
|             if (keepends)
 | |
|                 eol = i;
 | |
|         }
 | |
| #ifndef STRINGLIB_MUTABLE
 | |
|         if (j == 0 && eol == str_len && STRINGLIB_CHECK_EXACT(str_obj)) {
 | |
|             /* No linebreak in str_obj, so just use it as list[0] */
 | |
|             if (PyList_Append(list, str_obj))
 | |
|                 goto onError;
 | |
|             break;
 | |
|         }
 | |
| #endif
 | |
|         SPLIT_APPEND(str, j, eol);
 | |
|         j = i;
 | |
|     }
 | |
|     return list;
 | |
| 
 | |
|   onError:
 | |
|     Py_DECREF(list);
 | |
|     return NULL;
 | |
| }
 | |
| 
 | 
