Merging the py3k-pep3137 branch back into the py3k branch.

No detailed change log; just check out the change log for the py3k-pep3137
branch.  The most obvious changes:

  - str8 renamed to bytes (PyString at the C level);
  - bytes renamed to buffer (PyBytes at the C level);
  - PyString and PyUnicode are no longer compatible.

I.e. we now have an immutable bytes type and a mutable bytes type.

The behavior of PyString was modified quite a bit, to make it more
bytes-like.  Some changes are still on the to-do list.
This commit is contained in:
Guido van Rossum 2007-11-06 21:34:58 +00:00
parent a19f80c6df
commit 98297ee781
148 changed files with 2533 additions and 3517 deletions

View file

@ -101,7 +101,7 @@ extern "C" {
function will delete the reference from this dictionary.
Another way to look at this is that to say that the actual reference
count of a string is: s->ob_refcnt + (s->ob_sstate?2:0)
count of a string is: s->ob_refcnt + (s->state ? 2 : 0)
*/
static PyObject *interned;
@ -998,7 +998,10 @@ PyObject *PyUnicode_FromObject(register PyObject *obj)
return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
PyUnicode_GET_SIZE(obj));
}
return PyUnicode_FromEncodedObject(obj, NULL, "strict");
PyErr_Format(PyExc_TypeError,
"Can't convert '%.100s' object to str implicitly",
Py_Type(obj)->tp_name);
return NULL;
}
PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
@ -1219,22 +1222,7 @@ PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
v = PyCodec_Encode(unicode, encoding, errors);
if (v == NULL)
goto onError;
if (!PyBytes_Check(v)) {
if (PyString_Check(v)) {
/* Old codec, turn it into bytes */
PyObject *b = PyBytes_FromObject(v);
Py_DECREF(v);
return b;
}
PyErr_Format(PyExc_TypeError,
"encoder did not return a bytes object "
"(type=%.400s, encoding=%.20s, errors=%.20s)",
v->ob_type->tp_name,
encoding ? encoding : "NULL",
errors ? errors : "NULL");
Py_DECREF(v);
goto onError;
}
assert(PyString_Check(v));
return v;
onError:
@ -1245,19 +1233,15 @@ PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
const char *errors)
{
PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
PyObject *b;
if (v)
return v;
if (errors != NULL)
Py_FatalError("non-NULL encoding in _PyUnicode_AsDefaultEncodedString");
b = PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
v = PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
PyUnicode_GET_SIZE(unicode),
NULL);
if (!b)
if (!v)
return NULL;
v = PyString_FromStringAndSize(PyBytes_AsString(b),
PyBytes_Size(b));
Py_DECREF(b);
((PyUnicodeObject *)unicode)->defenc = v;
return v;
}
@ -1420,11 +1404,11 @@ int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler
inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
if (!inputobj)
goto onError;
if (!PyBytes_Check(inputobj)) {
if (!PyString_Check(inputobj)) {
PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
}
*input = PyBytes_AS_STRING(inputobj);
insize = PyBytes_GET_SIZE(inputobj);
*input = PyString_AS_STRING(inputobj);
insize = PyString_GET_SIZE(inputobj);
*inend = *input + insize;
/* we can DECREF safely, as the exception has another reference,
so the object won't go away. */
@ -1674,7 +1658,7 @@ PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
int encodeWhiteSpace,
const char *errors)
{
PyObject *v;
PyObject *v, *result;
/* It might be possible to tighten this worst case */
Py_ssize_t cbAllocated = 5 * size;
int inShift = 0;
@ -1685,7 +1669,7 @@ PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
char * start;
if (size == 0)
return PyBytes_FromStringAndSize(NULL, 0);
return PyString_FromStringAndSize(NULL, 0);
v = PyBytes_FromStringAndSize(NULL, cbAllocated);
if (v == NULL)
@ -1757,11 +1741,9 @@ PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
*out++ = '-';
}
if (PyBytes_Resize(v, out - start)) {
Py_DECREF(v);
return NULL;
}
return v;
result = PyString_FromStringAndSize(PyBytes_AS_STRING(v), out - start);
Py_DECREF(v);
return result;
}
#undef SPECIAL
@ -2001,11 +1983,11 @@ PyUnicode_EncodeUTF8(const Py_UNICODE *s,
{
#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Py_ssize_t i; /* index into s of next input byte */
PyObject *v; /* result string object */
char *p; /* next free byte in output buffer */
Py_ssize_t nallocated; /* number of result bytes allocated */
Py_ssize_t nneeded; /* number of result bytes needed */
Py_ssize_t i; /* index into s of next input byte */
PyObject *result; /* result string object */
char *p; /* next free byte in output buffer */
Py_ssize_t nallocated; /* number of result bytes allocated */
Py_ssize_t nneeded; /* number of result bytes needed */
char stackbuf[MAX_SHORT_UNICHARS * 4];
assert(s != NULL);
@ -2017,7 +1999,7 @@ PyUnicode_EncodeUTF8(const Py_UNICODE *s,
* turns out we need.
*/
nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
v = NULL; /* will allocate after we're done */
result = NULL; /* will allocate after we're done */
p = stackbuf;
}
else {
@ -2025,10 +2007,10 @@ PyUnicode_EncodeUTF8(const Py_UNICODE *s,
nallocated = size * 4;
if (nallocated / 4 != size) /* overflow! */
return PyErr_NoMemory();
v = PyBytes_FromStringAndSize(NULL, nallocated);
if (v == NULL)
result = PyString_FromStringAndSize(NULL, nallocated);
if (result == NULL)
return NULL;
p = PyBytes_AS_STRING(v);
p = PyString_AS_STRING(result);
}
for (i = 0; i < size;) {
@ -2072,19 +2054,19 @@ encodeUCS4:
}
}
if (v == NULL) {
if (result == NULL) {
/* This was stack allocated. */
nneeded = p - stackbuf;
assert(nneeded <= nallocated);
v = PyBytes_FromStringAndSize(stackbuf, nneeded);
result = PyString_FromStringAndSize(stackbuf, nneeded);
}
else {
/* Cut back to size actually needed. */
nneeded = p - PyBytes_AS_STRING(v);
nneeded = p - PyString_AS_STRING(result);
assert(nneeded <= nallocated);
PyBytes_Resize(v, nneeded);
_PyString_Resize(&result, nneeded);
}
return v;
return result;
#undef MAX_SHORT_UNICHARS
}
@ -2279,7 +2261,7 @@ PyUnicode_EncodeUTF32(const Py_UNICODE *s,
const char *errors,
int byteorder)
{
PyObject *v;
PyObject *v, *result;
unsigned char *p;
#ifndef Py_UNICODE_WIDE
int i, pairs;
@ -2319,7 +2301,7 @@ PyUnicode_EncodeUTF32(const Py_UNICODE *s,
if (byteorder == 0)
STORECHAR(0xFEFF);
if (size == 0)
return v;
goto done;
if (byteorder == -1) {
/* force LE */
@ -2350,7 +2332,11 @@ PyUnicode_EncodeUTF32(const Py_UNICODE *s,
#endif
STORECHAR(ch);
}
return v;
done:
result = PyString_FromStringAndSize(PyBytes_AS_STRING(v), Py_Size(v));
Py_DECREF(v);
return result;
#undef STORECHAR
}
@ -2549,7 +2535,7 @@ PyUnicode_EncodeUTF16(const Py_UNICODE *s,
const char *errors,
int byteorder)
{
PyObject *v;
PyObject *v, *result;
unsigned char *p;
#ifdef Py_UNICODE_WIDE
int i, pairs;
@ -2584,7 +2570,7 @@ PyUnicode_EncodeUTF16(const Py_UNICODE *s,
if (byteorder == 0)
STORECHAR(0xFEFF);
if (size == 0)
return v;
goto done;
if (byteorder == -1) {
/* force LE */
@ -2610,7 +2596,11 @@ PyUnicode_EncodeUTF16(const Py_UNICODE *s,
if (ch2)
STORECHAR(ch2);
}
return v;
done:
result = PyString_FromStringAndSize(PyBytes_AS_STRING(v), Py_Size(v));
Py_DECREF(v);
return result;
#undef STORECHAR
}
@ -2900,7 +2890,7 @@ static const char *hexdigits = "0123456789abcdef";
PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
Py_ssize_t size)
{
PyObject *repr;
PyObject *repr, *result;
char *p;
/* XXX(nnorwitz): rather than over-allocating, it would be
@ -3023,12 +3013,10 @@ PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
*p++ = (char) ch;
}
*p = '\0';
if (PyBytes_Resize(repr, p - PyBytes_AS_STRING(repr))) {
Py_DECREF(repr);
return NULL;
}
return repr;
result = PyString_FromStringAndSize(PyBytes_AS_STRING(repr),
p - PyBytes_AS_STRING(repr));
Py_DECREF(repr);
return result;
}
PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
@ -3159,7 +3147,7 @@ PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
Py_ssize_t size)
{
PyObject *repr;
PyObject *repr, *result;
char *p;
char *q;
@ -3171,7 +3159,7 @@ PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
if (repr == NULL)
return NULL;
if (size == 0)
return repr;
goto done;
p = q = PyBytes_AS_STRING(repr);
while (size-- > 0) {
@ -3205,12 +3193,12 @@ PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
else
*p++ = (char) ch;
}
*p = '\0';
if (PyBytes_Resize(repr, p - q)) {
Py_DECREF(repr);
return NULL;
}
return repr;
size = p - q;
done:
result = PyString_FromStringAndSize(PyBytes_AS_STRING(repr), size);
Py_DECREF(repr);
return result;
}
PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
@ -3445,23 +3433,23 @@ static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
/* pointer into the output */
char *str;
/* current output position */
Py_ssize_t respos = 0;
Py_ssize_t ressize;
const char *encoding = (limit == 256) ? "latin-1" : "ascii";
const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
PyObject *errorHandler = NULL;
PyObject *exc = NULL;
PyObject *result = NULL;
/* the following variable is used for caching string comparisons
* -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
int known_errorHandler = -1;
/* allocate enough for a simple encoding without
replacements, if we need more, we'll resize */
if (size == 0)
return PyString_FromStringAndSize(NULL, 0);
res = PyBytes_FromStringAndSize(NULL, size);
if (res == NULL)
goto onError;
if (size == 0)
return res;
return NULL;
str = PyBytes_AS_STRING(res);
ressize = size;
@ -3589,20 +3577,13 @@ static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
}
}
}
/* Resize if we allocated to much */
respos = str - PyBytes_AS_STRING(res);
if (respos<ressize)
/* If this falls res will be NULL */
PyBytes_Resize(res, respos);
result = PyString_FromStringAndSize(PyBytes_AS_STRING(res),
str - PyBytes_AS_STRING(res));
onError:
Py_DECREF(res);
Py_XDECREF(errorHandler);
Py_XDECREF(exc);
return res;
onError:
Py_XDECREF(res);
Py_XDECREF(errorHandler);
Py_XDECREF(exc);
return NULL;
return result;
}
PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
@ -3848,20 +3829,20 @@ static int encode_mbcs(PyObject **repr,
if (*repr == NULL) {
/* Create string object */
*repr = PyBytes_FromStringAndSize(NULL, mbcssize);
*repr = PyString_FromStringAndSize(NULL, mbcssize);
if (*repr == NULL)
return -1;
}
else {
/* Extend string object */
n = PyBytes_Size(*repr);
if (PyBytes_Resize(*repr, n + mbcssize) < 0)
n = PyString_Size(*repr);
if (_PyString_Resize(repr, n + mbcssize) < 0)
return -1;
}
/* Do the conversion */
if (size > 0) {
char *s = PyBytes_AS_STRING(*repr) + n;
char *s = PyString_AS_STRING(*repr) + n;
if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
PyErr_SetFromWindowsErrWithFilename(0, NULL);
return -1;
@ -4341,16 +4322,14 @@ static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
}
static int
charmapencode_resize(PyObject *outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
{
Py_ssize_t outsize = PyBytes_GET_SIZE( outobj);
Py_ssize_t outsize = PyString_GET_SIZE(*outobj);
/* exponentially overallocate to minimize reallocations */
if (requiredsize < 2*outsize)
requiredsize = 2*outsize;
if (PyBytes_Resize(outobj, requiredsize)) {
Py_DECREF(outobj);
if (_PyString_Resize(outobj, requiredsize))
return -1;
}
return 0;
}
@ -4365,21 +4344,21 @@ typedef enum charmapencode_result {
reallocation error occurred. The caller must decref the result */
static
charmapencode_result charmapencode_output(Py_UNICODE c, PyObject *mapping,
PyObject *outobj, Py_ssize_t *outpos)
PyObject **outobj, Py_ssize_t *outpos)
{
PyObject *rep;
char *outstart;
Py_ssize_t outsize = PyBytes_GET_SIZE(outobj);
Py_ssize_t outsize = PyString_GET_SIZE(*outobj);
if (Py_Type(mapping) == &EncodingMapType) {
int res = encoding_map_lookup(c, mapping);
Py_ssize_t requiredsize = *outpos+1;
if (res == -1)
return enc_FAILED;
if (outsize<requiredsize)
if (outsize<requiredsize)
if (charmapencode_resize(outobj, outpos, requiredsize))
return enc_EXCEPTION;
outstart = PyBytes_AS_STRING(outobj);
outstart = PyString_AS_STRING(*outobj);
outstart[(*outpos)++] = (char)res;
return enc_SUCCESS;
}
@ -4398,7 +4377,7 @@ charmapencode_result charmapencode_output(Py_UNICODE c, PyObject *mapping,
Py_DECREF(rep);
return enc_EXCEPTION;
}
outstart = PyBytes_AS_STRING(outobj);
outstart = PyString_AS_STRING(*outobj);
outstart[(*outpos)++] = (char)PyInt_AS_LONG(rep);
}
else {
@ -4410,7 +4389,7 @@ charmapencode_result charmapencode_output(Py_UNICODE c, PyObject *mapping,
Py_DECREF(rep);
return enc_EXCEPTION;
}
outstart = PyBytes_AS_STRING(outobj);
outstart = PyString_AS_STRING(*outobj);
memcpy(outstart + *outpos, repchars, repsize);
*outpos += repsize;
}
@ -4426,7 +4405,7 @@ int charmap_encoding_error(
const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
PyObject **exceptionObject,
int *known_errorHandler, PyObject **errorHandler, const char *errors,
PyObject *res, Py_ssize_t *respos)
PyObject **res, Py_ssize_t *respos)
{
PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Py_ssize_t repsize;
@ -4561,7 +4540,7 @@ PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
/* allocate enough for a simple encoding without
replacements, if we need more, we'll resize */
res = PyBytes_FromStringAndSize(NULL, size);
res = PyString_FromStringAndSize(NULL, size);
if (res == NULL)
goto onError;
if (size == 0)
@ -4569,14 +4548,14 @@ PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
while (inpos<size) {
/* try to encode it */
charmapencode_result x = charmapencode_output(p[inpos], mapping, res, &respos);
charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos);
if (x==enc_EXCEPTION) /* error */
goto onError;
if (x==enc_FAILED) { /* unencodable character */
if (charmap_encoding_error(p, size, &inpos, mapping,
&exc,
&known_errorHandler, &errorHandler, errors,
res, &respos)) {
&res, &respos)) {
goto onError;
}
}
@ -4586,10 +4565,9 @@ PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
}
/* Resize if we allocated to much */
if (respos<PyBytes_GET_SIZE(res)) {
if (PyBytes_Resize(res, respos))
goto onError;
}
if (respos<PyString_GET_SIZE(res))
_PyString_Resize(&res, respos);
Py_XDECREF(exc);
Py_XDECREF(errorHandler);
return res;
@ -5483,20 +5461,14 @@ PyUnicode_Join(PyObject *separator, PyObject *seq)
item = PySequence_Fast_GET_ITEM(fseq, i);
/* Convert item to Unicode. */
if (!PyString_Check(item) && !PyUnicode_Check(item))
{
if (PyBytes_Check(item))
{
PyErr_Format(PyExc_TypeError,
"sequence item %d: join() will not operate on "
"bytes objects", i);
goto onError;
}
item = PyObject_Unicode(item);
if (!PyUnicode_Check(item)) {
PyErr_Format(PyExc_TypeError,
"sequence item %zd: expected str instance,"
" %.80s found",
i, Py_Type(item)->tp_name);
goto onError;
}
else
item = PyUnicode_FromObject(item);
item = PyUnicode_FromObject(item);
if (item == NULL)
goto onError;
/* We own a reference to item from here on. */
@ -6396,9 +6368,6 @@ PyObject *PyUnicode_Concat(PyObject *left,
{
PyUnicodeObject *u = NULL, *v = NULL, *w;
if (PyBytes_Check(left) || PyBytes_Check(right))
return PyBytes_Concat(left, right);
/* Coerce the two arguments */
u = (PyUnicodeObject *)PyUnicode_FromObject(left);
if (u == NULL)
@ -6515,7 +6484,7 @@ unicode_encode(PyUnicodeObject *self, PyObject *args)
v = PyUnicode_AsEncodedObject((PyObject *)self, encoding, errors);
if (v == NULL)
goto onError;
if (!PyBytes_Check(v)) {
if (!PyString_Check(v)) {
PyErr_Format(PyExc_TypeError,
"encoder did not return a bytes object "
"(type=%.400s)",
@ -8232,12 +8201,6 @@ getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
return NULL;
}
#define F_LJUST (1<<0)
#define F_SIGN (1<<1)
#define F_BLANK (1<<2)
#define F_ALT (1<<3)
#define F_ZERO (1<<4)
static Py_ssize_t
strtounicode(Py_UNICODE *buffer, const char *charbuffer)
{