Add rsplit method for str and unicode builtin types.

SF feature request #801847. The original patch was written by Sean Reifschneider.
2025-08-04 00:48:58 +00:00 · 2003-12-15 18:49:53 +00:00 · 2003-12-15 18:49:53 +00:00 · 3ae811b57d
commit 3ae811b57d
parent dce391cb39
7 changed files with 402 additions and 1 deletions
--- a/Doc/lib/libstdtypes.tex
+++ b/Doc/lib/libstdtypes.tex
@ -694,6 +694,24 @@ The original string is returned if
 \versionchanged[Support for the \var{fillchar} argument]{2.4}
 \end{methoddesc}
 \begin{methoddesc}[string]{rsplit}{\optional{sep\optional{, maxsplit}}}
 Return a list of the words of the string, scanning the string from
 the end and working toward the front.  The resulting list of words is
 in the same order as \function{split()}.  If the optional first argument
 \var{sep} is absent or \code{None}, the words are separated by
 arbitrary strings of whitespace characters (space, tab, newline,
 return, formfeed).  If \var{sep} is present and
 not \code{None}, it specifies a string to be used as the word
 separator.  The returned list will then have one more item than the
 number of non-overlapping occurrences of the separator in the string.
 If the optional second argument \var{maxsplit} is omitted or negative,
 there is no limit on the number of splits.  Otherwise at most
 \var{maxsplit} splits occur, counted from the end of the string, and the
 remainder of the string is returned as the first element of the
 list (thus, the list will have at most \code{\var{maxsplit}+1}
 elements).
 \versionadded{2.4}
 \end{methoddesc}
 \begin{methoddesc}[string]{rstrip}{\optional{chars}}
 Return a copy of the string with trailing characters removed.  If
 \var{chars} is omitted or \code{None}, whitespace characters are
--- a/Doc/lib/libstring.tex
+++ b/Doc/lib/libstring.tex
@ -215,6 +215,23 @@ The functions defined in this module are:
  elements).
 \end{funcdesc}
 \begin{funcdesc}{rsplit}{s\optional{, sep\optional{, maxsplit}}}
  Return a list of the words of the string \var{s}, scanning \var{s} from
  the end and working toward the front.  The resulting list of words is in
  the same order as \function{split()}.  If the optional second argument
  \var{sep} is absent or \code{None}, the words are separated by arbitrary
  strings of whitespace characters (space, tab, newline, return, formfeed).
  If the second argument \var{sep} is present and not \code{None}, it
  specifies a string to be used as the word separator.  The returned
  list will then have one more item than the number of non-overlapping
  occurrences of the separator in the string.  If the optional third
  argument \var{maxsplit} is omitted or negative, there is no limit on
  the number of splits.  Otherwise at most \var{maxsplit} splits occur,
  counted from the end of the string, and the remainder of the string is
  returned as the first element of the list (thus, the list will have at
  most \code{\var{maxsplit}+1} elements).
  \versionadded{2.4}
 \end{funcdesc}
 \begin{funcdesc}{splitfields}{s\optional{, sep\optional{, maxsplit}}}
  This function behaves identically to \function{split()}.  (In the
  past, \function{split()} was only used with one argument, while
--- a/Include/unicodeobject.h
+++ b/Include/unicodeobject.h
@ -185,6 +185,7 @@ typedef PY_UNICODE_TYPE Py_UNICODE;
 # define PyUnicode_Resize PyUnicodeUCS2_Resize
 # define PyUnicode_SetDefaultEncoding PyUnicodeUCS2_SetDefaultEncoding
 # define PyUnicode_Split PyUnicodeUCS2_Split
 # define PyUnicode_RSplit PyUnicodeUCS2_RSplit
 # define PyUnicode_Splitlines PyUnicodeUCS2_Splitlines
 # define PyUnicode_Tailmatch PyUnicodeUCS2_Tailmatch
 # define PyUnicode_Translate PyUnicodeUCS2_Translate
@ -959,6 +960,25 @@ PyAPI_FUNC(PyObject*) PyUnicode_Splitlines(
    int keepends		/* If true, line end markers are included */
    );		
 /* Split a string giving a list of Unicode strings.
    If sep is NULL, splitting will be done at all whitespace
    substrings. Otherwise, splits occur at the given separator.
    At most maxsplit splits will be done; if maxsplit is negative,
    no limit is set. Unlike PyUnicode_Split, PyUnicode_RSplit
    scans from the end of the string, so when maxsplit limits the
    number of splits the unsplit remainder is the first list item.
    Separators are not included in the resulting list.
 */
 PyAPI_FUNC(PyObject*) PyUnicode_RSplit(
    PyObject *s,		/* String to split */
    PyObject *sep,		/* String separator */
    int maxsplit		/* Maxsplit count */
    );		
 /* Translate a string by applying a character mapping table to it and
   return the resulting Unicode object.
--- a/Lib/string.py
+++ b/Lib/string.py
@ -121,6 +121,18 @@ def split(s, sep=None, maxsplit=-1):
    return s.split(sep, maxsplit)
 splitfields = split
 # Split a string into a list of words, scanning from the right
 def rsplit(s, sep=None, maxsplit=-1):
    """rsplit(s [,sep [,maxsplit]]) -> list of strings
    Split the string s into words, working from the end of the
    string toward the front, and return them in their original
    order.  sep is the delimiter string; when it is omitted or
    None, any whitespace string separates words.  When maxsplit
    is given and not negative, at most maxsplit splits are done.
    """
    words = s.rsplit(sep, maxsplit)
    return words
 # Join fields with optional separator
 def join(words, sep = ' '):
    """join(list [,sep]) -> string
--- a/Lib/test/string_tests.py
+++ b/Lib/test/string_tests.py
@ -189,6 +189,26 @@ class CommonTest(unittest.TestCase):
        self.checkraises(TypeError, 'hello', 'split', 42, 42, 42)
    def test_rsplit(self):
        # Each entry: (expected list, string to split, arguments to rsplit).
        # The entries are checked in order, one checkequal call per entry.
        cases = [
            # default whitespace splitting, no limit
            (['this', 'is', 'the', 'rsplit', 'function'],
             'this is the rsplit function', ()),
            # single-character separator
            (['a', 'b', 'c', 'd'], 'a|b|c|d', ('|',)),
            (['a|b', 'c', 'd'], 'a|b|c|d', ('|', 2)),
            # whitespace splitting with a split limit
            (['a b c', 'd'], 'a b c d', (None, 1)),
            (['a b', 'c', 'd'], 'a b c d', (None, 2)),
            (['a', 'b', 'c', 'd'], 'a b c d', (None, 3)),
            (['a', 'b', 'c', 'd'], 'a b c d', (None, 4)),
            (['a b c d'], 'a b c d', (None, 0)),
            # multi-character separator with a split limit
            (['a, b, c', 'd'], 'a, b, c, d', (', ', 1)),
            (['a, b', 'c', 'd'], 'a, b, c, d', (', ', 2)),
            (['a', 'b', 'c', 'd'], 'a, b, c, d', (', ', 3)),
            (['a', 'b', 'c', 'd'], 'a, b, c, d', (', ', 4)),
            (['a, b, c, d'], 'a, b, c, d', (', ', 0)),
            # runs of whitespace, NUL separator, separator == whole string
            (['a  b', 'c', 'd'], 'a  b  c  d', (None, 2)),
            (['a\x00b', 'c'], 'a\x00b\x00c', ('\x00', 1)),
            (['', ''], 'abcd', ('abcd',)),
            # unicode separator
            ([u'a b', u'c', u'd'], 'a b c d', (u' ', 2)),
        ]
        for expected, subject, args in cases:
            self.checkequal(expected, subject, 'rsplit', *args)
    def test_strip(self):
        self.checkequal('hello', '   hello   ', 'strip')
        self.checkequal('hello   ', '   hello   ', 'lstrip')
--- a/Objects/stringobject.c
+++ b/Objects/stringobject.c
@ -1407,6 +1407,129 @@ string_split(PyStringObject *self, PyObject *args)
 	return NULL;
 }
 /* Split the bytes s[0:len] on runs of whitespace, scanning from the
    right.  At most maxsplit words are split off the end; whatever is
    left of them (up to and including the last non-whitespace character)
    becomes the first list item.  Returns a new list, or NULL after
    releasing the list when creating or inserting an item fails. */
 static PyObject *
 rsplit_whitespace(const char *s, int len, int maxsplit)
 {
 	PyObject *result, *word;
 	int start, end, status;
 	result = PyList_New(0);
 	if (result == NULL)
 		return NULL;
 	start = end = len - 1;
 	while (start >= 0) {
 		/* step left over whitespace to the last char of a word */
 		while (start >= 0 && isspace(Py_CHARMASK(s[start])))
 			start--;
 		end = start;
 		/* step left over the word itself */
 		while (start >= 0 && !isspace(Py_CHARMASK(s[start])))
 			start--;
 		if (end <= start)
 			continue;	/* no word found in this pass */
 		if (maxsplit-- <= 0)
 			break;
 		word = PyString_FromStringAndSize(s + start + 1, end - start);
 		if (word == NULL)
 			goto error;
 		/* prepend so the list ends up in left-to-right order */
 		status = PyList_Insert(result, 0, word);
 		Py_DECREF(word);
 		if (status < 0)
 			goto error;
 		while (start >= 0 && isspace(Py_CHARMASK(s[start])))
 			start--;
 		end = start;
 	}
 	/* unsplit remainder, present only if the split limit was reached */
 	if (end >= 0) {
 		word = PyString_FromStringAndSize(s, end + 1);
 		if (word == NULL)
 			goto error;
 		status = PyList_Insert(result, 0, word);
 		Py_DECREF(word);
 		if (status < 0)
 			goto error;
 	}
 	return result;
  error:
 	Py_DECREF(result);
 	return NULL;
 }
 /* Docstring for string_rsplit, registered as the "rsplit" method. */
 PyDoc_STRVAR(rsplit__doc__,
 "S.rsplit([sep [,maxsplit]]) -> list of strings\n\
 \n\
 Return a list of the words in the string S, using sep as the\n\
 delimiter string, starting at the end of the string and working\n\
 to the front.  If maxsplit is given, at most maxsplit splits are\n\
 done. If sep is not specified or is None, any whitespace string\n\
 is a separator.");
 /* Implementation of the "rsplit" string method.
    args holds the optional separator and the optional split limit.
    A separator of None selects whitespace splitting; a unicode
    separator hands the work to PyUnicode_RSplit; any other object is
    read through the character-buffer interface.  An empty separator
    raises ValueError.  Returns a new list or NULL on error. */
 static PyObject *
 string_rsplit(PyStringObject *self, PyObject *args)
 {
 	const char *s = PyString_AS_STRING(self);
 	const char *sub;
 	int len = PyString_GET_SIZE(self);
 	int n, pos, end, status;
 	int maxsplit = -1;
 	PyObject *subobj = Py_None;
 	PyObject *result, *piece;
 	if (!PyArg_ParseTuple(args, "|Oi:rsplit", &subobj, &maxsplit))
 		return NULL;
 	/* a negative limit means "no limit" */
 	if (maxsplit < 0)
 		maxsplit = INT_MAX;
 	if (subobj == Py_None)
 		return rsplit_whitespace(s, len, maxsplit);
 	if (PyString_Check(subobj)) {
 		n = PyString_GET_SIZE(subobj);
 		sub = PyString_AS_STRING(subobj);
 	}
 #ifdef Py_USING_UNICODE
 	else if (PyUnicode_Check(subobj))
 		return PyUnicode_RSplit((PyObject *)self, subobj, maxsplit);
 #endif
 	else if (PyObject_AsCharBuffer(subobj, &sub, &n))
 		return NULL;
 	if (n == 0) {
 		PyErr_SetString(PyExc_ValueError, "empty separator");
 		return NULL;
 	}
 	result = PyList_New(0);
 	if (result == NULL)
 		return NULL;
 	/* end is one past the piece being collected; pos is the
 	   candidate start of a separator match */
 	end = len;
 	pos = end - n;
 	while (pos >= 0) {
 		if (s[pos] != sub[0] || memcmp(s + pos, sub, n) != 0) {
 			pos--;
 			continue;
 		}
 		if (maxsplit-- <= 0)
 			break;
 		piece = PyString_FromStringAndSize(s + pos + n,
 						   end - pos - n);
 		if (piece == NULL)
 			goto error;
 		status = PyList_Insert(result, 0, piece);
 		Py_DECREF(piece);
 		if (status < 0)
 			goto error;
 		end = pos;
 		/* skip a whole separator so matches never overlap */
 		pos -= n;
 	}
 	/* everything left of the last split becomes the first item */
 	piece = PyString_FromStringAndSize(s, end);
 	if (piece == NULL)
 		goto error;
 	status = PyList_Insert(result, 0, piece);
 	Py_DECREF(piece);
 	if (status < 0)
 		goto error;
 	return result;
 error:
 	Py_DECREF(result);
 	return NULL;
 }
 PyDoc_STRVAR(join__doc__,
 "S.join(sequence) -> string\n\
@ -3064,6 +3187,7 @@ string_methods[] = {
 	   string.maketrans(). */
 	{"join", (PyCFunction)string_join, METH_O, join__doc__},
 	{"split", (PyCFunction)string_split, METH_VARARGS, split__doc__},
 	{"rsplit", (PyCFunction)string_rsplit, METH_VARARGS, rsplit__doc__},
 	{"lower", (PyCFunction)string_lower, METH_NOARGS, lower__doc__},
 	{"upper", (PyCFunction)string_upper, METH_NOARGS, upper__doc__},
 	{"islower", (PyCFunction)string_islower, METH_NOARGS, islower__doc__},
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@ -4053,7 +4053,7 @@ PyUnicodeObject *pad(PyUnicodeObject *self,
 }
 #define SPLIT_APPEND(data, left, right)					\
-	str = PyUnicode_FromUnicode(data + left, right - left);		\
+	str = PyUnicode_FromUnicode((data) + (left), (right) - (left));	\
 	if (!str)							\
 	    goto onError;						\
 	if (PyList_Append(list, str)) {					\
@ -4063,6 +4063,17 @@ PyUnicodeObject *pad(PyUnicodeObject *self,
        else								\
            Py_DECREF(str);
 /* Create a unicode object from data[left:right] and insert it at
    index 0 of `list`.  The expansion uses the names `str` and `list`
    and jumps to a label `onError` on failure, so all three must exist
    where the macro is used. */
 #define SPLIT_INSERT(data, left, right)					\
 	str = PyUnicode_FromUnicode((data) + (left), (right) - (left));	\
 	if (str == NULL)						\
 	    goto onError;						\
 	if (PyList_Insert(list, 0, str) != 0) {				\
 	    Py_DECREF(str);						\
 	    goto onError;						\
 	}								\
 	else								\
 	    Py_DECREF(str);
 static
 PyObject *split_whitespace(PyUnicodeObject *self,
 			   PyObject *list,
@ -4214,7 +4225,106 @@ PyObject *split_substring(PyUnicodeObject *self,
    return NULL;
 }
 /* Split self on runs of whitespace, scanning from the right, and
    insert each word at the front of list.  At most maxcount words are
    split off; the text left of them (up to its last non-whitespace
    character) is inserted as the first item.  Returns list, or NULL
    after releasing list on failure. */
 static
 PyObject *rsplit_whitespace(PyUnicodeObject *self,
 			    PyObject *list,
 			    int maxcount)
 {
     PyObject *str;
     register int lo;
     register int hi;
     lo = hi = self->length - 1;
     while (lo >= 0) {
         /* move left past whitespace to the last char of a word */
         while (lo >= 0 && Py_UNICODE_ISSPACE(self->str[lo]))
             lo--;
         hi = lo;
         /* move left past the word itself */
         while (lo >= 0 && !Py_UNICODE_ISSPACE(self->str[lo]))
             lo--;
         if (hi <= lo)
             continue;		/* no word found in this pass */
         if (maxcount-- <= 0)
             break;
         SPLIT_INSERT(self->str, lo + 1, hi + 1);
         while (lo >= 0 && Py_UNICODE_ISSPACE(self->str[lo]))
             lo--;
         hi = lo;
     }
     /* unsplit remainder, present only if the limit was reached */
     if (hi >= 0) {
         SPLIT_INSERT(self->str, 0, hi + 1);
     }
     return list;
  onError:
     Py_DECREF(list);
     return NULL;
 }
 /* Split self on every occurrence of the single character ch, scanning
    from the right, and insert each piece at the front of list so the
    result is in left-to-right order.  At most maxcount splits are made;
    the text left of the last split is always inserted as the first
    item, even when it is empty.  Returns list, or NULL after releasing
    list on failure. */
 static 
 PyObject *rsplit_char(PyUnicodeObject *self,
 		      PyObject *list,
 		      Py_UNICODE ch,
 		      int maxcount)
 {
    register int i;
    register int j;
    int len = self->length;
    PyObject *str;
    /* j is the index of the last character of the piece being
       collected; i scans left looking for the next separator. */
    for (i = j = len - 1; i >= 0; ) {
 	if (self->str[i] == ch) {
 	    if (maxcount-- <= 0)
 		break;
 	    SPLIT_INSERT(self->str, i + 1, j + 1);
 	    j = i = i - 1;
 	} else {
 	    i--;
 	}
    }
    /* The leftmost piece must always be emitted.  j never drops below
       -1, and j == -1 means that piece is empty (the string is empty
       or starts with ch); skipping it would lose a list item, e.g.
       splitting "|a" on "|" has to give ["", "a"], not ["a"]. */
    SPLIT_INSERT(self->str, 0, j + 1);
    return list;
 onError:
    Py_DECREF(list);
    return NULL;
 }
 /* Split self on occurrences of substring, scanning from the right,
    and insert each piece at the front of list.  At most maxcount
    splits are made; the text left of the last split is inserted as
    the first item.  Returns list, or NULL after releasing list on
    failure. */
 static 
 PyObject *rsplit_substring(PyUnicodeObject *self,
 			   PyObject *list,
 			   PyUnicodeObject *substring,
 			   int maxcount)
 {
     PyObject *str;
     int seplen = substring->length;
     /* end is one past the piece being collected; pos is the
        candidate start of a separator match */
     register int end = self->length;
     register int pos = end - seplen;
     while (pos >= 0) {
         if (Py_UNICODE_MATCH(self, pos, substring)) {
             if (maxcount-- <= 0)
                 break;
             SPLIT_INSERT(self->str, pos + seplen, end);
             end = pos;
             /* jump a whole separator so matches never overlap */
             pos -= seplen;
         } else {
             pos--;
         }
     }
     if (end >= 0) {
         SPLIT_INSERT(self->str, 0, end);
     }
     return list;
  onError:
     Py_DECREF(list);
     return NULL;
 }
 #undef SPLIT_APPEND
 #undef SPLIT_INSERT
 static
 PyObject *split(PyUnicodeObject *self,
@ -4245,6 +4355,35 @@ PyObject *split(PyUnicodeObject *self,
 	return split_substring(self,list,substring,maxcount);
 }
 /* Dispatch a right-to-left split of self to the matching helper.
    substring == NULL selects whitespace splitting, a one-character
    separator uses rsplit_char, longer separators use rsplit_substring,
    and an empty separator raises ValueError.  A negative maxcount
    means no limit.  Returns a new list or NULL on error. */
 static
 PyObject *rsplit(PyUnicodeObject *self,
 		 PyUnicodeObject *substring,
 		 int maxcount)
 {
     PyObject *result;
     int limit = (maxcount < 0) ? INT_MAX : maxcount;
     result = PyList_New(0);
     if (result == NULL)
         return NULL;
     if (substring == NULL)
         return rsplit_whitespace(self, result, limit);
     switch (substring->length) {
     case 0:
         Py_DECREF(result);
         PyErr_SetString(PyExc_ValueError, "empty separator");
         return NULL;
     case 1:
         return rsplit_char(self, result, substring->str[0], limit);
     default:
         return rsplit_substring(self, result, substring, limit);
     }
 }
 static
 PyObject *replace(PyUnicodeObject *self,
 		  PyUnicodeObject *str1,
@ -5675,6 +5814,56 @@ unicode_split(PyUnicodeObject *self, PyObject *args)
 	return PyUnicode_Split((PyObject *)self, substring, maxcount);
 }
 /* Public entry point: convert s (and sep, when it is not NULL) with
    PyUnicode_FromObject, then split from the right via rsplit().
    Returns the resulting list, or NULL if a conversion or the split
    fails. */
 PyObject *PyUnicode_RSplit(PyObject *s,
 			   PyObject *sep,
 			   int maxsplit)
 {
     PyObject *text;
     PyObject *separator = NULL;
     PyObject *result = NULL;
     text = PyUnicode_FromObject(s);
     if (text == NULL)
         return NULL;
     if (sep != NULL) {
         separator = PyUnicode_FromObject(sep);
         if (separator == NULL)
             goto done;
     }
     result = rsplit((PyUnicodeObject *)text,
                     (PyUnicodeObject *)separator,
                     maxsplit);
  done:
     /* drop the references taken by the conversions above */
     Py_DECREF(text);
     Py_XDECREF(separator);
     return result;
 }
 /* Docstring for unicode_rsplit.  It mentions None explicitly because
    unicode_rsplit treats a separator of None the same as an omitted
    one. */
 PyDoc_STRVAR(rsplit__doc__,
 "S.rsplit([sep [,maxsplit]]) -> list of strings\n\
 \n\
 Return a list of the words in S, using sep as the\n\
 delimiter string, starting at the end of the string and\n\
 working to the front.  If maxsplit is given, at most maxsplit\n\
 splits are done. If sep is not specified or is None, any\n\
 whitespace string is a separator.");
 /* Implementation of the "rsplit" unicode method.  args holds the
    optional separator and the optional split limit.  A separator of
    None selects whitespace splitting, a unicode separator is used
    directly, and anything else goes through PyUnicode_RSplit, which
    converts it first. */
 static PyObject*
 unicode_rsplit(PyUnicodeObject *self, PyObject *args)
 {
     PyObject *sep = Py_None;
     int limit = -1;
     if (!PyArg_ParseTuple(args, "|Oi:rsplit", &sep, &limit))
         return NULL;
     if (sep == Py_None)
         return rsplit(self, NULL, limit);
     if (!PyUnicode_Check(sep))
         return PyUnicode_RSplit((PyObject *)self, sep, limit);
     return rsplit(self, (PyUnicodeObject *)sep, limit);
 }
 PyDoc_STRVAR(splitlines__doc__,
 "S.splitlines([keepends]]) -> list of strings\n\
 \n\
@ -5870,6 +6059,7 @@ static PyMethodDef unicode_methods[] = {
    {"encode", (PyCFunction) unicode_encode, METH_VARARGS, encode__doc__},
    {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
    {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
    {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
    {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
    {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
    {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},