PyUnicode_Join(): Rewrote to use PySequence_Fast().

This doesn't do much to reduce the size of the code, but greatly improves its clarity. It's also quicker in what's probably the most common case (the argument iterable is a list). Against it, if the iterable isn't a list or a tuple, a temporary tuple is materialized containing the entire input sequence, and that's a bigger temporary memory burden. Yawn.
2025-10-14 18:59:46 +00:00 · 2004-08-27 21:32:02 +00:00 · 2004-08-27 21:32:02 +00:00 · 05eba1fdc8
commit 05eba1fdc8
parent cca018356d
1 changed files with 96 additions and 126 deletions
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@ -3979,159 +3979,129 @@ PyObject *
 PyUnicode_Join(PyObject *separator, PyObject *seq)
 {
    PyObject *internal_separator = NULL;
-    Py_UNICODE *sep;
+    const Py_UNICODE *sep;
    size_t seplen;
-    PyUnicodeObject *res = NULL;
+    PyUnicodeObject *res = NULL; /* the result */
-    size_t sz;      /* # allocated bytes for string in res */
+    size_t res_alloc = 100;  /* # allocated bytes for string in res */
-    size_t reslen;  /* # used bytes */
+    size_t res_used;         /* # used bytes */
-    Py_UNICODE *p;  /* pointer to free byte in res's string area */
+    Py_UNICODE *res_p;       /* pointer to free byte in res's string area */
-    PyObject *it;   /* iterator */
+    PyObject *fseq;          /* PySequence_Fast(seq) */
    int seqlen;              /* len(fseq) -- number of items in sequence */
    const Py_UNICODE blank = ' ';
    PyObject *item;
    int i;
    PyObject *temp;
-    it = PyObject_GetIter(seq);
+    fseq = PySequence_Fast(seq, "");
-    if (it == NULL)
+    if (fseq == NULL) {
-        return NULL;
+	if (PyErr_ExceptionMatches(PyExc_TypeError))
-
+	    PyErr_Format(PyExc_TypeError,
-    item = PyIter_Next(it);
+			 "sequence expected, %.80s found",
-    if (item == NULL) {
+			 seq->ob_type->tp_name);
-        if (PyErr_Occurred())
+    	return NULL;
            goto onError;
        /* empty sequence; return u"" */
        res = _PyUnicode_New(0);
        goto Done;
    }
-    /* If this is the only item, maybe we can get out cheap. */
+    seqlen = PySequence_Fast_GET_SIZE(fseq);
-    res = (PyUnicodeObject *)item;
+    /* If empty sequence, return u"". */
-    item = PyIter_Next(it);
+    if (seqlen == 0) {
-    if (item == NULL) {
+    	res = _PyUnicode_New(0);  /* empty sequence; return u"" */
-        if (PyErr_Occurred())
+    	goto Done;
-            goto onError;
+    }
-        /* There's only one item in the sequence. */
+    /* If singleton sequence with an exact Unicode, return that. */
-        if (PyUnicode_CheckExact(res)) /* whatever.join([u]) -> u */
+    if (seqlen == 1) {
-            goto Done;
+	item = PySequence_Fast_GET_ITEM(fseq, 0);
 	if (PyUnicode_CheckExact(item)) {
 	    Py_INCREF(item);
 	    res = (PyUnicodeObject *)item;
 	    goto Done;
 	}
    }
-    /* There are at least two to join (item != NULL), or there's only
+    /* At least two items to join, or one that isn't exact Unicode. */
-     * one but it's not an exact Unicode (item == NULL).  res needs
+    if (seqlen > 1) {
-     * conversion to Unicode in either case.
+        /* Set up sep and seplen -- they're needed. */
-     * Caution:  we may need to ensure a copy is made, and that's trickier
+    	if (separator == NULL) {
-     * than it sounds because, e.g., PyUnicode_FromObject() may return
+	    sep = &blank;
-     * a shared object (which must not be mutated).
+	    seplen = 1;
-     */
+        }
-    if (! PyUnicode_Check(res) && ! PyString_Check(res)) {
+    	else {
-        PyErr_Format(PyExc_TypeError,
+	    internal_separator = PyUnicode_FromObject(separator);
-                "sequence item 0: expected string or Unicode,"
+	    if (internal_separator == NULL)
-    	        " %.80s found",
+	        goto onError;
-    	       res->ob_type->tp_name);
+	    sep = PyUnicode_AS_UNICODE(internal_separator);
-    	Py_XDECREF(item);
+	    seplen = PyUnicode_GET_SIZE(internal_separator);
        }
    }
    /* Get space. */
    res = _PyUnicode_New((int)res_alloc);
    if (res == NULL)
        goto onError;
-    }
+    res_p = PyUnicode_AS_UNICODE(res);
-    temp = PyUnicode_FromObject((PyObject *)res);
+    res_used = 0;
    if (temp == NULL) {
        Py_XDECREF(item);
        goto onError;
    }
    Py_DECREF(res);
    if (item == NULL) {
    	/* res was the only item */
        res = (PyUnicodeObject *)temp;
        goto Done;
    }
    /* There are at least two items.  As above, temp may be a shared object,
     * so we need to copy it.
     */
    reslen = PyUnicode_GET_SIZE(temp);
    sz = reslen + 100;  /* breathing room */
    if (sz < reslen || sz > INT_MAX) /* overflow -- no breathing room */
    	sz = reslen;
    res = _PyUnicode_New((int)sz);
    if (res == NULL) {
        Py_DECREF(item);
        goto onError;
    }
    p = PyUnicode_AS_UNICODE(res);
    Py_UNICODE_COPY(p, PyUnicode_AS_UNICODE(temp), (int)reslen);
    p += reslen;
    Py_DECREF(temp);
-    if (separator == NULL) {
+    for (i = 0; i < seqlen; ++i) {
-	Py_UNICODE blank = ' ';
+	size_t itemlen;
-	sep = &blank;
+	size_t new_res_used;
-	seplen = 1;
+
-    }
+	item = PySequence_Fast_GET_ITEM(fseq, i);
-    else {
+	/* Convert item to Unicode. */
-	internal_separator = PyUnicode_FromObject(separator);
+	if (! PyUnicode_Check(item) && ! PyString_Check(item)) {
-	if (internal_separator == NULL) {
+	    PyErr_Format(PyExc_TypeError,
-	    Py_DECREF(item);
+			 "sequence item %i: expected string or Unicode,"
 			 " %.80s found",
 			 i, item->ob_type->tp_name);
 	    goto onError;
 	}
-	sep = PyUnicode_AS_UNICODE(internal_separator);
+	item = PyUnicode_FromObject(item);
-	seplen = PyUnicode_GET_SIZE(internal_separator);
+	if (item == NULL)
-    }
+	    goto onError;
 	/* We own a reference to item from here on. */
    i = 1;
    do {
 	size_t itemlen;
 	size_t newreslen;
 	/* Catenate the separator, then item. */
 	/* First convert item to Unicode. */
 	if (!PyUnicode_Check(item)) {
 	    PyObject *v;
 	    if (!PyString_Check(item)) {
 		PyErr_Format(PyExc_TypeError,
 			     "sequence item %i: expected string or Unicode,"
 			     " %.80s found",
 			     i, item->ob_type->tp_name);
 		Py_DECREF(item);
 		goto onError;
 	    }
 	    v = PyUnicode_FromObject(item);
 	    Py_DECREF(item);
 	    item = v;
 	    if (item == NULL)
 		goto onError;
 	}
        /* Make sure we have enough space for the separator and the item. */
 	itemlen = PyUnicode_GET_SIZE(item);
-	newreslen = reslen + seplen + itemlen;
+	new_res_used = res_used + itemlen;
-	if (newreslen < reslen ||  newreslen > INT_MAX)
+	if (new_res_used < res_used ||  new_res_used > INT_MAX)
 	    goto Overflow;
-	if (newreslen > sz) {
+	if (i < seqlen - 1) {
 	    new_res_used += seplen;
 	    if (new_res_used < res_used ||  new_res_used > INT_MAX)
 		goto Overflow;
 	}
 	if (new_res_used > res_alloc) {
 	    /* double allocated size until it's big enough */
 	    do {
-	        size_t oldsize = sz;
+	        size_t oldsize = res_alloc;
-	        sz += sz;
+	        res_alloc += res_alloc;
-	        if (sz < oldsize || sz > INT_MAX)
+	        if (res_alloc < oldsize || res_alloc > INT_MAX)
 	            goto Overflow;
-	    } while (newreslen > sz);
+	    } while (new_res_used > res_alloc);
-	    if (_PyUnicode_Resize(&res, (int)sz) < 0) {
+	    if (_PyUnicode_Resize(&res, (int)res_alloc) < 0) {
 		Py_DECREF(item);
 		goto onError;
 	    }
-            p = PyUnicode_AS_UNICODE(res) + reslen;
+            res_p = PyUnicode_AS_UNICODE(res) + res_used;
 	}
 	/* Copy item, and maybe the separator. */
 	Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), (int)itemlen);
 	res_p += itemlen;
 	if (i < seqlen - 1) {
 	    Py_UNICODE_COPY(res_p, sep, (int)seplen);
 	    res_p += seplen;
 	}
 	Py_UNICODE_COPY(p, sep, (int)seplen);
 	p += seplen;
 	Py_UNICODE_COPY(p, PyUnicode_AS_UNICODE(item), (int)itemlen);
 	p += itemlen;
 	Py_DECREF(item);
-	reslen = newreslen;
+	res_used = new_res_used;
    }
-        ++i;
+    /* Shrink res to match the used area; this probably can't fail,
-	item = PyIter_Next(it);
+     * but it's cheap to check.
-    } while (item != NULL);
+     */
-    if (PyErr_Occurred())
+    if (_PyUnicode_Resize(&res, (int)res_used) < 0)
 	goto onError;
    if (_PyUnicode_Resize(&res, (int)reslen) < 0)
 	goto onError;
 Done:
    Py_XDECREF(internal_separator);
-    Py_DECREF(it);
+    Py_DECREF(fseq);
    return (PyObject *)res;
 Overflow:
@ -4142,7 +4112,7 @@ PyUnicode_Join(PyObject *separator, PyObject *seq)
 onError:
    Py_XDECREF(internal_separator);
-    Py_DECREF(it);
+    Py_DECREF(fseq);
    Py_XDECREF(res);
    return NULL;
 }