Merging the py3k-pep3137 branch back into the py3k branch.

No detailed change log; just check out the change log for the py3k-pep3137 branch. The most obvious changes: - str8 renamed to bytes (PyString at the C level); - bytes renamed to buffer (PyBytes at the C level); - PyString and PyUnicode are no longer compatible. I.e. we now have an immutable bytes type and a mutable bytes type. The behavior of PyString was modified quite a bit, to make it more bytes-like. Some changes are still on the to-do list.
2025-11-03 03:22:27 +00:00 · 2007-11-06 21:34:58 +00:00 · 2007-11-06 21:34:58 +00:00 · 98297ee781
commit 98297ee781
parent a19f80c6df
148 changed files with 2533 additions and 3517 deletions
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@ -101,7 +101,7 @@ extern "C" {
   function will delete the reference from this dictionary.

   Another way to look at this is that to say that the actual reference
-   count of a string is:  s->ob_refcnt + (s->ob_sstate?2:0)
+   count of a string is:  s->ob_refcnt + (s->state ? 2 : 0)
 */
 static PyObject *interned;

@ -998,7 +998,10 @@ PyObject *PyUnicode_FromObject(register PyObject *obj)
 	return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
 				     PyUnicode_GET_SIZE(obj));
    }
-    return PyUnicode_FromEncodedObject(obj, NULL, "strict");
+    PyErr_Format(PyExc_TypeError,
+                 "Can't convert '%.100s' object to str implicitly",
+                 Py_Type(obj)->tp_name);
+    return NULL;
 }

 PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
@ -1219,22 +1222,7 @@ PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
    v = PyCodec_Encode(unicode, encoding, errors);
    if (v == NULL)
        goto onError;
-    if (!PyBytes_Check(v)) {
-        if (PyString_Check(v)) {
-            /* Old codec, turn it into bytes */
-            PyObject *b = PyBytes_FromObject(v);
-            Py_DECREF(v);
-            return b;
-        }
-        PyErr_Format(PyExc_TypeError,
-                     "encoder did not return a bytes object "
-                     "(type=%.400s, encoding=%.20s, errors=%.20s)",
-                     v->ob_type->tp_name,
-                     encoding ? encoding : "NULL",
-                     errors ? errors : "NULL");
-        Py_DECREF(v);
-        goto onError;
-    }
+    assert(PyString_Check(v));
    return v;

 onError:
@ -1245,19 +1233,15 @@ PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
 					    const char *errors)
 {
    PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
-    PyObject *b;
    if (v)
        return v;
    if (errors != NULL)
        Py_FatalError("non-NULL encoding in _PyUnicode_AsDefaultEncodedString");
-    b = PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
+    v = PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
                             PyUnicode_GET_SIZE(unicode),
                             NULL);
-    if (!b)
+    if (!v)
        return NULL;
-    v = PyString_FromStringAndSize(PyBytes_AsString(b),
-                                   PyBytes_Size(b));
-    Py_DECREF(b);
    ((PyUnicodeObject *)unicode)->defenc = v;
    return v;
 }
@ -1420,11 +1404,11 @@ int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler
    inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
    if (!inputobj)
        goto onError;
-    if (!PyBytes_Check(inputobj)) {
+    if (!PyString_Check(inputobj)) {
 	PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
    }
-    *input = PyBytes_AS_STRING(inputobj);
-    insize = PyBytes_GET_SIZE(inputobj);
+    *input = PyString_AS_STRING(inputobj);
+    insize = PyString_GET_SIZE(inputobj);
    *inend = *input + insize;
    /* we can DECREF safely, as the exception has another reference,
       so the object won't go away. */
@ -1674,7 +1658,7 @@ PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
                   int encodeWhiteSpace,
                   const char *errors)
 {
-    PyObject *v;
+    PyObject *v, *result;
    /* It might be possible to tighten this worst case */
    Py_ssize_t cbAllocated = 5 * size;
    int inShift = 0;
@ -1685,7 +1669,7 @@ PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
    char * start;

    if (size == 0)
-	return PyBytes_FromStringAndSize(NULL, 0);
+	return PyString_FromStringAndSize(NULL, 0);

    v = PyBytes_FromStringAndSize(NULL, cbAllocated);
    if (v == NULL)
@ -1757,11 +1741,9 @@ PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
        *out++ = '-';
    }

-    if (PyBytes_Resize(v, out - start)) {
-        Py_DECREF(v);
-        return NULL;
-    }
-    return v;
+    result = PyString_FromStringAndSize(PyBytes_AS_STRING(v), out - start);
+    Py_DECREF(v);
+    return result;
 }

 #undef SPECIAL
@ -2001,11 +1983,11 @@ PyUnicode_EncodeUTF8(const Py_UNICODE *s,
 {
 #define MAX_SHORT_UNICHARS 300  /* largest size we'll do on the stack */

-    Py_ssize_t i;           /* index into s of next input byte */
-    PyObject *v;        /* result string object */
-    char *p;            /* next free byte in output buffer */
-    Py_ssize_t nallocated;  /* number of result bytes allocated */
-    Py_ssize_t nneeded;        /* number of result bytes needed */
+    Py_ssize_t i;                /* index into s of next input byte */
+    PyObject *result;            /* result string object */
+    char *p;                     /* next free byte in output buffer */
+    Py_ssize_t nallocated;      /* number of result bytes allocated */
+    Py_ssize_t nneeded;            /* number of result bytes needed */
    char stackbuf[MAX_SHORT_UNICHARS * 4];

    assert(s != NULL);
@ -2017,7 +1999,7 @@ PyUnicode_EncodeUTF8(const Py_UNICODE *s,
         * turns out we need.
         */
        nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
-        v = NULL;   /* will allocate after we're done */
+        result = NULL;   /* will allocate after we're done */
        p = stackbuf;
    }
    else {
@ -2025,10 +2007,10 @@ PyUnicode_EncodeUTF8(const Py_UNICODE *s,
        nallocated = size * 4;
        if (nallocated / 4 != size)  /* overflow! */
            return PyErr_NoMemory();
-        v = PyBytes_FromStringAndSize(NULL, nallocated);
-        if (v == NULL)
+        result = PyString_FromStringAndSize(NULL, nallocated);
+        if (result == NULL)
            return NULL;
-        p = PyBytes_AS_STRING(v);
+        p = PyString_AS_STRING(result);
    }

    for (i = 0; i < size;) {
@ -2072,19 +2054,19 @@ encodeUCS4:
        }
    }

-    if (v == NULL) {
+    if (result == NULL) {
        /* This was stack allocated. */
        nneeded = p - stackbuf;
        assert(nneeded <= nallocated);
-        v = PyBytes_FromStringAndSize(stackbuf, nneeded);
+        result = PyString_FromStringAndSize(stackbuf, nneeded);
    }
    else {
    	/* Cut back to size actually needed. */
-        nneeded = p - PyBytes_AS_STRING(v);
+        nneeded = p - PyString_AS_STRING(result);
        assert(nneeded <= nallocated);
-        PyBytes_Resize(v, nneeded);
+        _PyString_Resize(&result, nneeded);
    }
-    return v;
+    return result;

 #undef MAX_SHORT_UNICHARS
 }
@ -2279,7 +2261,7 @@ PyUnicode_EncodeUTF32(const Py_UNICODE *s,
 		      const char *errors,
 		      int byteorder)
 {
-    PyObject *v;
+    PyObject *v, *result;
    unsigned char *p;
 #ifndef Py_UNICODE_WIDE
    int i, pairs;
@ -2319,7 +2301,7 @@ PyUnicode_EncodeUTF32(const Py_UNICODE *s,
    if (byteorder == 0)
 	STORECHAR(0xFEFF);
    if (size == 0)
-        return v;
+        goto done;

    if (byteorder == -1) {
        /* force LE */
@ -2350,7 +2332,11 @@ PyUnicode_EncodeUTF32(const Py_UNICODE *s,
 #endif
        STORECHAR(ch);
    }
-    return v;
+
+  done:
+    result = PyString_FromStringAndSize(PyBytes_AS_STRING(v), Py_Size(v));
+    Py_DECREF(v);
+    return result;
 #undef STORECHAR
 }

@ -2549,7 +2535,7 @@ PyUnicode_EncodeUTF16(const Py_UNICODE *s,
 		      const char *errors,
 		      int byteorder)
 {
-    PyObject *v;
+    PyObject *v, *result;
    unsigned char *p;
 #ifdef Py_UNICODE_WIDE
    int i, pairs;
@ -2584,7 +2570,7 @@ PyUnicode_EncodeUTF16(const Py_UNICODE *s,
    if (byteorder == 0)
 	STORECHAR(0xFEFF);
    if (size == 0)
-        return v;
+        goto done;

    if (byteorder == -1) {
        /* force LE */
@ -2610,7 +2596,11 @@ PyUnicode_EncodeUTF16(const Py_UNICODE *s,
        if (ch2)
            STORECHAR(ch2);
    }
-    return v;
+
+  done:
+    result = PyString_FromStringAndSize(PyBytes_AS_STRING(v), Py_Size(v));
+    Py_DECREF(v);
+    return result;
 #undef STORECHAR
 }

@ -2900,7 +2890,7 @@ static const char *hexdigits = "0123456789abcdef";
 PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
 					Py_ssize_t size)
 {
-    PyObject *repr;
+    PyObject *repr, *result;
    char *p;

    /* XXX(nnorwitz): rather than over-allocating, it would be
@ -3023,12 +3013,10 @@ PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
            *p++ = (char) ch;
    }

-    *p = '\0';
-    if (PyBytes_Resize(repr, p - PyBytes_AS_STRING(repr))) {
-        Py_DECREF(repr);
-        return NULL;
-    }
-    return repr;
+    result = PyString_FromStringAndSize(PyBytes_AS_STRING(repr),
+                                        p - PyBytes_AS_STRING(repr));
+    Py_DECREF(repr);
+    return result;
 }

 PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
@ -3159,7 +3147,7 @@ PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
 PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
 					   Py_ssize_t size)
 {
-    PyObject *repr;
+    PyObject *repr, *result;
    char *p;
    char *q;

@ -3171,7 +3159,7 @@ PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
    if (repr == NULL)
        return NULL;
    if (size == 0)
-	return repr;
+        goto done;

    p = q = PyBytes_AS_STRING(repr);
    while (size-- > 0) {
@ -3205,12 +3193,12 @@ PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
 	else
            *p++ = (char) ch;
    }
-    *p = '\0';
-    if (PyBytes_Resize(repr, p - q)) {
-        Py_DECREF(repr);
-        return NULL;
-    }
-    return repr;
+    size = p - q;
+
+  done:
+    result = PyString_FromStringAndSize(PyBytes_AS_STRING(repr), size);
+    Py_DECREF(repr);
+    return result;
 }

 PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
@ -3445,23 +3433,23 @@ static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
    /* pointer into the output */
    char *str;
    /* current output position */
-    Py_ssize_t respos = 0;
    Py_ssize_t ressize;
    const char *encoding = (limit == 256) ? "latin-1" : "ascii";
    const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;
+    PyObject *result = NULL;
    /* the following variable is used for caching string comparisons
     * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
    int known_errorHandler = -1;

    /* allocate enough for a simple encoding without
       replacements, if we need more, we'll resize */
+    if (size == 0)
+        return PyString_FromStringAndSize(NULL, 0);
    res = PyBytes_FromStringAndSize(NULL, size);
    if (res == NULL)
-        goto onError;
-    if (size == 0)
-	return res;
+        return NULL;
    str = PyBytes_AS_STRING(res);
    ressize = size;

@ -3589,20 +3577,13 @@ static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
 	    }
 	}
    }
-    /* Resize if we allocated to much */
-    respos = str - PyBytes_AS_STRING(res);
-    if (respos<ressize)
-       /* If this falls res will be NULL */
-	PyBytes_Resize(res, respos);
+    result = PyString_FromStringAndSize(PyBytes_AS_STRING(res),
+                                        str - PyBytes_AS_STRING(res));
+  onError:
+    Py_DECREF(res);
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
-    return res;
-
-    onError:
-    Py_XDECREF(res);
-    Py_XDECREF(errorHandler);
-    Py_XDECREF(exc);
-    return NULL;
+    return result;
 }

 PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
@ -3848,20 +3829,20 @@ static int encode_mbcs(PyObject **repr,

    if (*repr == NULL) {
 	/* Create string object */
-	*repr = PyBytes_FromStringAndSize(NULL, mbcssize);
+	*repr = PyString_FromStringAndSize(NULL, mbcssize);
 	if (*repr == NULL)
 	    return -1;
    }
    else {
 	/* Extend string object */
-	n = PyBytes_Size(*repr);
-	if (PyBytes_Resize(*repr, n + mbcssize) < 0)
+	n = PyString_Size(*repr);
+	if (_PyString_Resize(repr, n + mbcssize) < 0)
 	    return -1;
    }

    /* Do the conversion */
    if (size > 0) {
-	char *s = PyBytes_AS_STRING(*repr) + n;
+	char *s = PyString_AS_STRING(*repr) + n;
 	if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
 	    PyErr_SetFromWindowsErrWithFilename(0, NULL);
 	    return -1;
@ -4341,16 +4322,14 @@ static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
 }

 static int
-charmapencode_resize(PyObject *outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
+charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
 {
-	Py_ssize_t outsize = PyBytes_GET_SIZE(  outobj);
+	Py_ssize_t outsize = PyString_GET_SIZE(*outobj);
 	/* exponentially overallocate to minimize reallocations */
 	if (requiredsize < 2*outsize)
 	    requiredsize = 2*outsize;
-	if (PyBytes_Resize(outobj, requiredsize)) {
-	    Py_DECREF(outobj);
+	if (_PyString_Resize(outobj, requiredsize))
 	    return -1;
-	}
 	return 0;
 }

@ -4365,21 +4344,21 @@ typedef enum charmapencode_result {
   reallocation error occurred. The caller must decref the result */
 static
 charmapencode_result charmapencode_output(Py_UNICODE c, PyObject *mapping,
-    PyObject *outobj, Py_ssize_t *outpos)
+    PyObject **outobj, Py_ssize_t *outpos)
 {
    PyObject *rep;
    char *outstart;
-    Py_ssize_t outsize = PyBytes_GET_SIZE(outobj);
+    Py_ssize_t outsize = PyString_GET_SIZE(*outobj);

    if (Py_Type(mapping) == &EncodingMapType) {
        int res = encoding_map_lookup(c, mapping);
 	Py_ssize_t requiredsize = *outpos+1;
        if (res == -1)
            return enc_FAILED;
-	if (outsize<requiredsize) 
+	if (outsize<requiredsize)
 	    if (charmapencode_resize(outobj, outpos, requiredsize))
 		return enc_EXCEPTION;
-        outstart = PyBytes_AS_STRING(outobj);
+        outstart = PyString_AS_STRING(*outobj);
 	outstart[(*outpos)++] = (char)res;
 	return enc_SUCCESS;
    }
@ -4398,7 +4377,7 @@ charmapencode_result charmapencode_output(Py_UNICODE c, PyObject *mapping,
 		    Py_DECREF(rep);
 		    return enc_EXCEPTION;
 		}
-            outstart = PyBytes_AS_STRING(outobj);
+            outstart = PyString_AS_STRING(*outobj);
 	    outstart[(*outpos)++] = (char)PyInt_AS_LONG(rep);
 	}
 	else {
@ -4410,7 +4389,7 @@ charmapencode_result charmapencode_output(Py_UNICODE c, PyObject *mapping,
 		    Py_DECREF(rep);
 		    return enc_EXCEPTION;
 		}
-            outstart = PyBytes_AS_STRING(outobj);
+            outstart = PyString_AS_STRING(*outobj);
 	    memcpy(outstart + *outpos, repchars, repsize);
 	    *outpos += repsize;
 	}
@ -4426,7 +4405,7 @@ int charmap_encoding_error(
    const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
    PyObject **exceptionObject,
    int *known_errorHandler, PyObject **errorHandler, const char *errors,
-    PyObject *res, Py_ssize_t *respos)
+    PyObject **res, Py_ssize_t *respos)
 {
    PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
    Py_ssize_t repsize;
@ -4561,7 +4540,7 @@ PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,

    /* allocate enough for a simple encoding without
       replacements, if we need more, we'll resize */
-    res = PyBytes_FromStringAndSize(NULL, size);
+    res = PyString_FromStringAndSize(NULL, size);
    if (res == NULL)
        goto onError;
    if (size == 0)
@ -4569,14 +4548,14 @@ PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,

    while (inpos<size) {
 	/* try to encode it */
-	charmapencode_result x = charmapencode_output(p[inpos], mapping, res, &respos);
+	charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos);
 	if (x==enc_EXCEPTION) /* error */
 	    goto onError;
 	if (x==enc_FAILED) { /* unencodable character */
 	    if (charmap_encoding_error(p, size, &inpos, mapping,
 		&exc,
 		&known_errorHandler, &errorHandler, errors,
-		res, &respos)) {
+		&res, &respos)) {
 		goto onError;
 	    }
 	}
@ -4586,10 +4565,9 @@ PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
    }

    /* Resize if we allocated to much */
-    if (respos<PyBytes_GET_SIZE(res)) {
-	if (PyBytes_Resize(res, respos))
-	    goto onError;
-    }
+    if (respos<PyString_GET_SIZE(res))
+	_PyString_Resize(&res, respos);
+
    Py_XDECREF(exc);
    Py_XDECREF(errorHandler);
    return res;
@ -5483,20 +5461,14 @@ PyUnicode_Join(PyObject *separator, PyObject *seq)

 	item = PySequence_Fast_GET_ITEM(fseq, i);
 	/* Convert item to Unicode. */
-	if (!PyString_Check(item) && !PyUnicode_Check(item))
-	{
-		if (PyBytes_Check(item))
-		{
-			PyErr_Format(PyExc_TypeError,
-                            "sequence item %d: join() will not operate on "
-                            "bytes objects", i);
-			goto onError;
-		}
-		item = PyObject_Unicode(item);
+	if (!PyUnicode_Check(item)) {
+	    PyErr_Format(PyExc_TypeError,
+			 "sequence item %zd: expected str instance,"
+			 " %.80s found",
+			 i, Py_Type(item)->tp_name);
+	    goto onError;
 	}
-	else
-		item = PyUnicode_FromObject(item);
-
+	item = PyUnicode_FromObject(item);
 	if (item == NULL)
 	    goto onError;
 	/* We own a reference to item from here on. */
@ -6396,9 +6368,6 @@ PyObject *PyUnicode_Concat(PyObject *left,
 {
    PyUnicodeObject *u = NULL, *v = NULL, *w;

-    if (PyBytes_Check(left) || PyBytes_Check(right))
-        return PyBytes_Concat(left, right);
-
    /* Coerce the two arguments */
    u = (PyUnicodeObject *)PyUnicode_FromObject(left);
    if (u == NULL)
@ -6515,7 +6484,7 @@ unicode_encode(PyUnicodeObject *self, PyObject *args)
    v = PyUnicode_AsEncodedObject((PyObject *)self, encoding, errors);
    if (v == NULL)
        goto onError;
-    if (!PyBytes_Check(v)) {
+    if (!PyString_Check(v)) {
        PyErr_Format(PyExc_TypeError,
                     "encoder did not return a bytes object "
                     "(type=%.400s)",
@ -8232,12 +8201,6 @@ getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
    return NULL;
 }

-#define F_LJUST (1<<0)
-#define F_SIGN	(1<<1)
-#define F_BLANK (1<<2)
-#define F_ALT	(1<<3)
-#define F_ZERO	(1<<4)
-
 static Py_ssize_t
 strtounicode(Py_UNICODE *buffer, const char *charbuffer)
 {