Add 'U'/'U#' format characters to Py_BuildValue (and thus

to PyObject_CallFunction()) that take a char * (and a size
in the case of 'U#') and create a unicode object out of it.

Add functions PyUnicode_FromFormat() and PyUnicode_FromFormatV()
that work similar to PyString_FromFormat(), but create a unicode
object (also a %U format character has been added, that takes
a PyObject *, which must point to a unicode object).

Change the encoding and reason attributes of UnicodeEncodeError,
UnicodeDecodeError and UnicodeTranslateError to be unicode
objects.
This commit is contained in:
Walter Dörwald 2007-05-18 16:29:38 +00:00
parent 5550731d9c
commit d2034310d6
6 changed files with 376 additions and 113 deletions

View file

@ -393,15 +393,9 @@ PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
return (PyObject *)unicode;
}
PyObject *PyUnicode_FromString(const char *u)
PyObject *PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
{
PyUnicodeObject *unicode;
size_t size = strlen(u);
if (size > PY_SSIZE_T_MAX) {
PyErr_SetString(PyExc_OverflowError, "input too long");
return NULL;
}
/* If the Unicode data is known at construction time, we can apply
some optimizations which share commonly used objects. */
if (u != NULL) {
@ -441,6 +435,17 @@ PyObject *PyUnicode_FromString(const char *u)
return (PyObject *)unicode;
}
PyObject *PyUnicode_FromString(const char *u)
{
size_t size = strlen(u);
if (size > PY_SSIZE_T_MAX) {
PyErr_SetString(PyExc_OverflowError, "input too long");
return NULL;
}
return PyUnicode_FromStringAndSize(u, size);
}
#ifdef HAVE_WCHAR_H
PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
@ -473,6 +478,223 @@ PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
return (PyObject *)unicode;
}
#define appendstring(string) {for (copy = string;*copy;) *s++ = *copy++;}
PyObject *
PyUnicode_FromFormatV(const char *format, va_list vargs)
{
va_list count;
Py_ssize_t n = 0;
const char* f;
Py_UNICODE *s;
PyObject *string;
/* used by sprintf */
char buffer[21];
const char *copy;
#ifdef VA_LIST_IS_ARRAY
Py_MEMCPY(count, vargs, sizeof(va_list));
#else
#ifdef __va_copy
__va_copy(count, vargs);
#else
count = vargs;
#endif
#endif
/* step 1: figure out how large a buffer we need */
for (f = format; *f; f++) {
if (*f == '%') {
const char* p = f;
while (*++f && *f != '%' && !isalpha(Py_CHARMASK(*f)))
;
/* skip the 'l' or 'z' in {%ld, %zd, %lu, %zu} since
* they don't affect the amount of space we reserve.
*/
if ((*f == 'l' || *f == 'z') &&
(f[1] == 'd' || f[1] == 'u'))
++f;
switch (*f) {
case 'c':
(void)va_arg(count, int);
/* fall through... */
case '%':
n++;
break;
case 'd': case 'u': case 'i': case 'x':
(void) va_arg(count, int);
/* 20 bytes is enough to hold a 64-bit
integer. Decimal takes the most space.
This isn't enough for octal. */
n += 20;
break;
case 's':
n += strlen(va_arg(count, char*));
break;
case 'U':
{
PyObject *obj = va_arg(count, PyObject *);
assert(obj && PyUnicode_Check(obj));
n += PyUnicode_GET_SIZE(obj);
break;
}
case 'p':
(void) va_arg(count, int);
/* maximum 64-bit pointer representation:
* 0xffffffffffffffff
* so 19 characters is enough.
* XXX I count 18 -- what's the extra for?
*/
n += 19;
break;
default:
/* if we stumble upon an unknown
formatting code, copy the rest of
the format string to the output
string. (we cannot just skip the
code, since there's no way to know
what's in the argument list) */
n += strlen(p);
goto expand;
}
} else
n++;
}
expand:
/* step 2: fill the buffer */
/* Since we've analyzed how much space we need for the worst case,
we don't have to resize the string. */
string = PyUnicode_FromUnicode(NULL, n);
if (!string)
return NULL;
s = PyUnicode_AS_UNICODE(string);
for (f = format; *f; f++) {
if (*f == '%') {
const char* p = f++;
int longflag = 0;
int size_tflag = 0;
/* parse the width.precision part (we're only
interested in the precision value, if any) */
n = 0;
while (isdigit(Py_CHARMASK(*f)))
n = (n*10) + *f++ - '0';
if (*f == '.') {
f++;
n = 0;
while (isdigit(Py_CHARMASK(*f)))
n = (n*10) + *f++ - '0';
}
while (*f && *f != '%' && !isalpha(Py_CHARMASK(*f)))
f++;
/* handle the long flag, but only for %ld and %lu.
others can be added when necessary. */
if (*f == 'l' && (f[1] == 'd' || f[1] == 'u')) {
longflag = 1;
++f;
}
/* handle the size_t flag. */
if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) {
size_tflag = 1;
++f;
}
switch (*f) {
case 'c':
*s++ = va_arg(vargs, int);
break;
case 'd':
if (longflag)
sprintf(buffer, "%ld", va_arg(vargs, long));
else if (size_tflag)
sprintf(buffer, "%" PY_FORMAT_SIZE_T "d",
va_arg(vargs, Py_ssize_t));
else
sprintf(buffer, "%d", va_arg(vargs, int));
appendstring(buffer);
break;
case 'u':
if (longflag)
sprintf(buffer, "%lu",
va_arg(vargs, unsigned long));
else if (size_tflag)
sprintf(buffer, "%" PY_FORMAT_SIZE_T "u",
va_arg(vargs, size_t));
else
sprintf(buffer, "%u",
va_arg(vargs, unsigned int));
appendstring(buffer);
break;
case 'i':
sprintf(buffer, "%i", va_arg(vargs, int));
appendstring(buffer);
break;
case 'x':
sprintf(buffer, "%x", va_arg(vargs, int));
appendstring(buffer);
break;
case 's':
p = va_arg(vargs, char*);
appendstring(p);
break;
case 'U':
{
PyObject *obj = va_arg(vargs, PyObject *);
Py_UNICODE *ucopy = PyUnicode_AS_UNICODE(obj);
Py_ssize_t usize = PyUnicode_GET_SIZE(obj);
Py_ssize_t upos;
for (upos = 0; upos<usize;)
*s++ = ucopy[upos++];
break;
}
case 'p':
sprintf(buffer, "%p", va_arg(vargs, void*));
/* %p is ill-defined: ensure leading 0x. */
if (buffer[1] == 'X')
buffer[1] = 'x';
else if (buffer[1] != 'x') {
memmove(buffer+2, buffer, strlen(buffer)+1);
buffer[0] = '0';
buffer[1] = 'x';
}
appendstring(buffer);
break;
case '%':
*s++ = '%';
break;
default:
appendstring(p);
goto end;
}
} else
*s++ = *f;
}
end:
_PyUnicode_Resize(&string, s - PyUnicode_AS_UNICODE(string));
return string;
}
#undef appendstring
PyObject *
PyUnicode_FromFormat(const char *format, ...)
{
PyObject* ret;
va_list vargs;
#ifdef HAVE_STDARG_PROTOTYPES
va_start(vargs, format);
#else
va_start(vargs);
#endif
ret = PyUnicode_FromFormatV(format, vargs);
va_end(vargs);
return ret;
}
Py_ssize_t PyUnicode_AsWideChar(PyUnicodeObject *unicode,
wchar_t *w,
Py_ssize_t size)